mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-06 17:14:07 +00:00
Compare commits
9 Commits
b4863
...
gg/infill-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ed58975f51 | ||
|
|
6ab2e4765a | ||
|
|
96e1280839 | ||
|
|
2c9f833d17 | ||
|
|
251364549f | ||
|
|
8acdacb3ea | ||
|
|
89b2b56e86 | ||
|
|
e128a1bf5b | ||
|
|
6ef79a67ca |
2
Makefile
2
Makefile
@@ -836,7 +836,7 @@ ifdef GGML_MUSA
|
||||
else
|
||||
MUSA_PATH ?= /opt/musa
|
||||
endif
|
||||
MUSA_ARCHITECTURES ?= 21;22
|
||||
MUSA_ARCHITECTURES ?= 21;22;31
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
|
||||
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
|
||||
|
||||
@@ -172,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
|
||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||
- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
|
||||
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
|
||||
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
|
||||
|
||||
@@ -1867,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
||||
add_opt(common_arg(
|
||||
{"-o", "--output", "--output-file"}, "FNAME",
|
||||
string_format("output file (default: '%s')",
|
||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||
? params.lora_outfile.c_str()
|
||||
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
||||
? params.cvector_outfile.c_str()
|
||||
: params.out_file.c_str()),
|
||||
string_format("output file (default: '%s')", params.out_file.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
params.cvector_outfile = value;
|
||||
params.lora_outfile = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -407,8 +407,6 @@ struct common_params {
|
||||
int32_t i_pos = -1; // position of the passkey in the junk text
|
||||
|
||||
// imatrix params
|
||||
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
|
||||
|
||||
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
||||
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
||||
int32_t i_chunk = 0; // start processing from this chunk
|
||||
@@ -420,16 +418,16 @@ struct common_params {
|
||||
int n_pca_batch = 100;
|
||||
int n_pca_iterations = 1000;
|
||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||
std::string cvector_outfile = "control_vector.gguf";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||
|
||||
// batched-bench params
|
||||
bool batched_bench_output_jsonl = false;
|
||||
|
||||
// common params
|
||||
std::string out_file; // output filename for all example programs
|
||||
};
|
||||
|
||||
// call once at the start of a program if it uses libcommon
|
||||
|
||||
@@ -197,29 +197,53 @@ The following compilation options are also available to tweak performance:
|
||||
|
||||
## MUSA
|
||||
|
||||
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
|
||||
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
|
||||
|
||||
- Using `CMake`:
|
||||
#### Download directly from Moore Threads
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON
|
||||
cmake --build build --config Release
|
||||
You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
|
||||
|
||||
### Compilation
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
#### Override Compute Capability Specifications
|
||||
|
||||
By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
|
||||
```
|
||||
|
||||
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
|
||||
|
||||
#### Compilation options
|
||||
|
||||
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||
|
||||
- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
|
||||
```
|
||||
|
||||
For static build:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON \
|
||||
-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
|
||||
### Runtime MUSA environment variables
|
||||
|
||||
You may set the [MUSA environment variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
|
||||
|
||||
```bash
|
||||
# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
|
||||
MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
|
||||
```
|
||||
|
||||
### Unified Memory
|
||||
|
||||
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory on Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
|
||||
|
||||
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||
|
||||
## HIP
|
||||
|
||||
This provides GPU acceleration on HIP-supported AMD GPUs.
|
||||
|
||||
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "control_vector.gguf";
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// write output vectors to gguf
|
||||
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||
export_gguf(ctx_train.v_final, params.out_file, model_hint);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "ggml-lora-merged-f16.gguf";
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_verbose = (params.verbosity > 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
printf("done, output file is %s\n", params.lora_outfile.c_str());
|
||||
printf("done, output file is %s\n", params.out_file.c_str());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
|
||||
void IMatrixCollector::save_imatrix(int ncall) const {
|
||||
auto fname = m_params.out_file;
|
||||
if (fname.empty()) {
|
||||
fname = "imatrix.dat";
|
||||
}
|
||||
|
||||
if (ncall > 0) {
|
||||
fname += ".at_";
|
||||
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "imatrix.dat" ;
|
||||
|
||||
params.n_ctx = 512;
|
||||
params.logits_all = true;
|
||||
params.escape = false;
|
||||
|
||||
@@ -4,31 +4,12 @@
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
//#include "ggml-cuda.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_SYCL
|
||||
//#include "ggml-sycl.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_METAL
|
||||
//#include "ggml-metal.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_CANN
|
||||
//#include "ggml-cann.h"
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_VULKAN
|
||||
//#include "ggml-vulkan.h"
|
||||
//#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
@@ -600,18 +581,54 @@ struct clip_ctx {
|
||||
bool has_post_norm = false;
|
||||
bool has_patch_bias = false;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
struct gguf_context * ctx_gguf = nullptr;
|
||||
struct ggml_context * ctx_data = nullptr;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_gallocr_t compute_alloc = NULL;
|
||||
ggml_backend_t backend = nullptr;
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
ggml_backend_buffer_t buf = nullptr;
|
||||
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
struct clip_image_size * load_image_size;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
backend = ctx_params.use_gpu
|
||||
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
|
||||
: nullptr;
|
||||
|
||||
if (backend) {
|
||||
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
backend_ptrs.push_back(backend);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
||||
} else {
|
||||
backend = backend_cpu;
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
backend_ptrs.push_back(backend_cpu);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
||||
|
||||
sched.reset(
|
||||
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
|
||||
);
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_backend_buffer_free(buf);
|
||||
ggml_backend_free(backend);
|
||||
if (backend_cpu != backend) {
|
||||
ggml_backend_free(backend_cpu);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
@@ -1184,6 +1201,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return clip_init(fname, clip_context_params{
|
||||
/* use_gpu */ true,
|
||||
/* verbosity */ verbosity,
|
||||
});
|
||||
}
|
||||
|
||||
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
|
||||
int verbosity = ctx_params.verbosity;
|
||||
struct ggml_context * meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@@ -1277,7 +1302,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx{};
|
||||
clip_ctx * new_clip = new clip_ctx(ctx_params);
|
||||
|
||||
// update projector type
|
||||
{
|
||||
@@ -1296,36 +1321,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
// new_clip->backend = ggml_backend_cuda_init(0);
|
||||
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_METAL
|
||||
// new_clip->backend = ggml_backend_metal_init();
|
||||
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_CANN
|
||||
// new_clip->backend = ggml_backend_cann_init(0);
|
||||
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_VULKAN
|
||||
// new_clip->backend = ggml_backend_vk_init(0);
|
||||
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||
//#endif
|
||||
//
|
||||
//#ifdef GGML_USE_SYCL
|
||||
// new_clip->backend = ggml_backend_sycl_init(0);
|
||||
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
||||
//#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
|
||||
@@ -1421,7 +1416,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
|
||||
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
|
||||
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||
@@ -1434,7 +1431,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return nullptr;
|
||||
}
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
@@ -1720,14 +1717,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
batch.data = nullptr;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
|
||||
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = new_clip->backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
|
||||
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
|
||||
if (size > 1) {
|
||||
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
@@ -2408,12 +2412,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
ggml_backend_buffer_free(ctx->params_buffer);
|
||||
ggml_backend_free(ctx->backend);
|
||||
ggml_gallocr_free(ctx->compute_alloc);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -2609,8 +2607,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
||||
|
||||
// set inputs
|
||||
const auto & model = ctx->vision_model;
|
||||
@@ -2775,11 +2774,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
|
||||
return false;
|
||||
}
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
|
||||
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
int verbosity;
|
||||
};
|
||||
|
||||
// deprecated, use clip_init
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
|
||||
|
||||
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ params->verbosity,
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
|
||||
@@ -2162,35 +2162,58 @@ struct server_context {
|
||||
|
||||
if (slot.has_new_line) {
|
||||
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
|
||||
if (slot.params.n_indent > 0) {
|
||||
if (slot.params.n_indent >= 0) {
|
||||
// check the current indentation
|
||||
// TODO: improve by not doing it more than once for each new line
|
||||
if (slot.last_nl_pos > 0) {
|
||||
size_t pos = slot.last_nl_pos;
|
||||
int n_indent = 0;
|
||||
|
||||
int n_indent = 0;
|
||||
while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) {
|
||||
n_indent++;
|
||||
pos++;
|
||||
size_t pos = slot.last_nl_pos;
|
||||
|
||||
while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t' || slot.generated_text[pos] == '\n')) {
|
||||
n_indent++;
|
||||
|
||||
if (slot.generated_text[pos] == '\n') {
|
||||
n_indent = 0;
|
||||
slot.last_nl_pos = pos + 1;
|
||||
}
|
||||
|
||||
if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) {
|
||||
pos++;
|
||||
}
|
||||
|
||||
if (0 < pos && pos < slot.generated_text.size()) {
|
||||
if (n_indent < slot.params.n_indent) {
|
||||
slot.stop = STOP_TYPE_LIMIT;
|
||||
slot.has_next_token = false;
|
||||
|
||||
// cut the last line
|
||||
slot.generated_text.erase(pos, std::string::npos);
|
||||
//slot.generated_text.erase(pos, std::string::npos);
|
||||
|
||||
SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
|
||||
}
|
||||
}
|
||||
|
||||
//SLT_ERR(slot, "n_indent = %d (%d), generated_text.size() = %d, n_decoded = %d, last_nl_pos = %d\n", n_indent, slot.params.n_indent, slot.generated_text.size(), slot.n_decoded, slot.last_nl_pos);
|
||||
|
||||
// find the next new line
|
||||
{
|
||||
const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
|
||||
size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
|
||||
|
||||
if (pos != std::string::npos) {
|
||||
while (pos != std::string::npos) {
|
||||
slot.last_nl_pos = pos + 1;
|
||||
|
||||
// detect end of paragraph at current indent level
|
||||
if (slot.generated_text[slot.last_nl_pos - 2] == '\n' && n_indent <= slot.params.n_indent) {
|
||||
slot.stop = STOP_TYPE_LIMIT;
|
||||
slot.has_next_token = false;
|
||||
|
||||
// cut the last line
|
||||
slot.generated_text.erase(pos, std::string::npos);
|
||||
|
||||
SLT_DBG(slot, "stopped by reaching end of paragraph, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
pos = slot.generated_text.find('\n', slot.last_nl_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,6 +195,8 @@ option(GGML_OPENCL "ggml: use OpenCL"
|
||||
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
||||
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
||||
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
||||
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
|
||||
"gmml: OpenCL API version to target")
|
||||
|
||||
# toolchain for vulkan-shaders-gen
|
||||
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
|
||||
|
||||
@@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
|
||||
static struct ggml_backend_metal_device_context {
|
||||
id<MTLDevice> mtl_device;
|
||||
int mtl_device_ref_count;
|
||||
id<MTLLibrary> mtl_library;
|
||||
|
||||
bool has_simdgroup_reduction;
|
||||
bool has_simdgroup_mm;
|
||||
@@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
|
||||
} g_ggml_ctx_dev_main = {
|
||||
/*.mtl_device =*/ nil,
|
||||
/*.mtl_device_ref_count =*/ 0,
|
||||
/*.mtl_library =*/ nil,
|
||||
/*.has_simdgroup_reduction =*/ false,
|
||||
/*.has_simdgroup_mm =*/ false,
|
||||
/*.has_residency_sets =*/ false,
|
||||
@@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
ctx->mtl_device_ref_count--;
|
||||
|
||||
if (ctx->mtl_device_ref_count == 0) {
|
||||
if (ctx->mtl_library) {
|
||||
[ctx->mtl_library release];
|
||||
ctx->mtl_library = nil;
|
||||
}
|
||||
|
||||
if (ctx->mtl_device) {
|
||||
[ctx->mtl_device release];
|
||||
ctx->mtl_device = nil;
|
||||
@@ -495,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) {
|
||||
return data;
|
||||
}
|
||||
|
||||
// load library
|
||||
//
|
||||
// - first check if the library is embedded
|
||||
// - then check if the library is in the bundle
|
||||
// - if not found, load the source and compile it
|
||||
// - if that fails, return NULL
|
||||
static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
|
||||
id<MTLLibrary> metal_library = nil;
|
||||
NSError * error = nil;
|
||||
NSString * src = nil;
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
|
||||
#else
|
||||
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (path_lib == nil) {
|
||||
// Try to find the resource in the directory where the current binary located.
|
||||
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
|
||||
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
|
||||
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
|
||||
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
|
||||
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
|
||||
// Optionally, if this is a symlink, try to resolve it.
|
||||
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
|
||||
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
|
||||
// It is a relative path, adding the binary directory as directory prefix.
|
||||
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
|
||||
}
|
||||
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
// Link to the resource could not be resolved.
|
||||
default_metallib_path = nil;
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The resource couldn't be found in the binary's directory.
|
||||
default_metallib_path = nil;
|
||||
}
|
||||
path_lib = default_metallib_path;
|
||||
}
|
||||
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
|
||||
metal_library = [device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * path_source;
|
||||
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
|
||||
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
||||
|
||||
if (path_resource) {
|
||||
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
||||
} else {
|
||||
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
}
|
||||
|
||||
if (path_source == nil) {
|
||||
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
||||
path_source = @"ggml-metal.metal";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!metal_library) {
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
if (use_bfloat) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
metal_library = [device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if !__has_feature(objc_arc)
|
||||
[options release];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
return metal_library;
|
||||
}
|
||||
|
||||
static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_INFO("%s: allocating\n", __func__);
|
||||
|
||||
@@ -522,136 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
id<MTLLibrary> metal_library = nil;
|
||||
|
||||
// load library
|
||||
//
|
||||
// - first check if the library is embedded
|
||||
// - then check if the library is in the bundle
|
||||
// - if not found, load the source and compile it
|
||||
// - if that fails, return NULL
|
||||
{
|
||||
NSError * error = nil;
|
||||
NSString * src = nil;
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
|
||||
#else
|
||||
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (path_lib == nil) {
|
||||
// Try to find the resource in the directory where the current binary located.
|
||||
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
|
||||
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
|
||||
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
|
||||
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
|
||||
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
|
||||
// Optionally, if this is a symlink, try to resolve it.
|
||||
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
|
||||
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
|
||||
// It is a relative path, adding the binary directory as directory prefix.
|
||||
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
|
||||
}
|
||||
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
|
||||
// Link to the resource could not be resolved.
|
||||
default_metallib_path = nil;
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The resource couldn't be found in the binary's directory.
|
||||
default_metallib_path = nil;
|
||||
}
|
||||
path_lib = default_metallib_path;
|
||||
}
|
||||
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
|
||||
metal_library = [device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * path_source;
|
||||
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
|
||||
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
||||
|
||||
if (path_resource) {
|
||||
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
||||
} else {
|
||||
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
}
|
||||
|
||||
if (path_source == nil) {
|
||||
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
||||
path_source = @"ggml-metal.metal";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!metal_library) {
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
if (ctx_dev->use_bfloat) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
metal_library = [device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if !__has_feature(objc_arc)
|
||||
[options release];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
if (ctx_dev->mtl_library == nil) {
|
||||
ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
|
||||
}
|
||||
id<MTLLibrary> metal_library = ctx_dev->mtl_library;
|
||||
if (metal_library == nil) {
|
||||
GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// print MTL GPU family:
|
||||
@@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
[metal_function release]; \
|
||||
if (error) { \
|
||||
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
[metal_library release]; \
|
||||
return NULL; \
|
||||
} \
|
||||
} else { \
|
||||
@@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
|
||||
}
|
||||
|
||||
[metal_library release];
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ if (MUSAToolkit_FOUND)
|
||||
message(STATUS "MUSA Toolkit found")
|
||||
|
||||
if (NOT DEFINED MUSA_ARCHITECTURES)
|
||||
set(MUSA_ARCHITECTURES "21;22")
|
||||
set(MUSA_ARCHITECTURES "21;22;31")
|
||||
endif()
|
||||
message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
|
||||
endif ()
|
||||
|
||||
add_compile_definitions(GGML_OPENCL_SOA_Q)
|
||||
add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})
|
||||
|
||||
if (GGML_OPENCL_USE_ADRENO_KERNELS)
|
||||
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#define CL_TARGET_OPENCL_VERSION 220
|
||||
#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
|
||||
// suppress warnings in CL headers for GCC and Clang
|
||||
@@ -25,6 +25,8 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <charconv>
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
|
||||
X1E,
|
||||
};
|
||||
|
||||
struct ggml_cl_version {
|
||||
cl_uint major = 0;
|
||||
cl_uint minor = 0;
|
||||
};
|
||||
|
||||
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version parse_cl_version(std::string_view str) {
|
||||
size_t major_str_begin = 0;
|
||||
size_t major_str_end = str.find(".", major_str_begin);
|
||||
if (major_str_end == std::string::npos) {
|
||||
return {};
|
||||
}
|
||||
|
||||
size_t minor_str_begin = major_str_end + 1;
|
||||
size_t minor_str_end = str.find(" ", minor_str_begin);
|
||||
if (minor_str_end == std::string::npos) {
|
||||
return {};
|
||||
}
|
||||
|
||||
cl_uint version_major;
|
||||
if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
|
||||
return {};
|
||||
}
|
||||
|
||||
cl_uint version_minor;
|
||||
if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
|
||||
return {};
|
||||
}
|
||||
return { version_major, version_minor };
|
||||
}
|
||||
|
||||
// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
|
||||
size_t param_size;
|
||||
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, ¶m_size));
|
||||
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
||||
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
|
||||
|
||||
auto param_value = std::string_view(param_storage.get(), param_size);
|
||||
const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
|
||||
if (param_value.find(version_prefix) != 0) {
|
||||
return {};
|
||||
}
|
||||
param_value.remove_prefix(version_prefix.length());
|
||||
return parse_cl_version(param_value);
|
||||
}
|
||||
|
||||
// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
|
||||
size_t param_size;
|
||||
|
||||
#if CL_TARGET_OPENCL_VERSION >= 300
|
||||
if (platform_version.major >= 3) {
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, ¶m_size));
|
||||
if (!param_size) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
|
||||
unsigned versions_count = param_size / sizeof(cl_name_version);
|
||||
|
||||
cl_version version_max = 0;
|
||||
for (unsigned i = 0; i < versions_count; i++) {
|
||||
version_max = std::max<cl_version>(versions[i].version, version_max);
|
||||
}
|
||||
|
||||
return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(platform_version);
|
||||
#endif // CL_TARGET_OPENCL_VERSION >= 300
|
||||
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, ¶m_size));
|
||||
if (!param_size) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
|
||||
auto param_value = std::string_view(param_storage.get(), param_size);
|
||||
|
||||
const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
|
||||
if (param_value.find(version_prefix) != 0) {
|
||||
return {};
|
||||
}
|
||||
param_value.remove_prefix(version_prefix.length());
|
||||
|
||||
return parse_cl_version(param_value);
|
||||
}
|
||||
|
||||
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
|
||||
if (strstr(device_name, "730") ||
|
||||
strstr(device_name, "740") ||
|
||||
@@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// A local ref of cl_device_id for convenience
|
||||
cl_device_id device = backend_ctx->device;
|
||||
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
size_t device_ver_str_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
|
||||
char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
|
||||
clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
|
||||
device_ver_buffer[device_ver_str_size] = '\0';
|
||||
GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
|
||||
|
||||
if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
|
||||
strstr(device_ver_buffer, "OpenCL 3") == NULL) {
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
|
||||
if (opencl_c_version.major < 2) {
|
||||
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
||||
return backend_ctx;
|
||||
}
|
||||
@@ -516,8 +604,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
|
||||
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
||||
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
||||
if (strstr(device_ver_buffer, "OpenCL 3") &&
|
||||
strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
||||
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
||||
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
||||
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
||||
@@ -581,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
const std::string kernel_src = read_file("ggml-opencl.cl");
|
||||
#endif
|
||||
|
||||
std::string compile_opts =
|
||||
"-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
|
||||
"-cl-finite-math-only -cl-fast-relaxed-math ";
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
||||
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
// Non matmul kernels.
|
||||
@@ -693,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
|
||||
|
||||
// Gemv general
|
||||
std::string CL_gemv_compile_opts =
|
||||
" -cl-std=CL2.0 "
|
||||
" -cl-mad-enable "
|
||||
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -DSIMDGROUP_WIDTH=" +
|
||||
std::to_string(backend_ctx->adreno_wave_size);
|
||||
if (has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
@@ -713,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
||||
|
||||
// Gemv 2048, 16384
|
||||
CL_gemv_compile_opts =
|
||||
" -cl-std=CL2.0 "
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=2048 "
|
||||
" -DBLOCK_STRIDE_A=16384 "
|
||||
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
|
||||
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=2048 "
|
||||
" -DBLOCK_STRIDE_A=16384 "
|
||||
" -DSIMDGROUP_WIDTH=" +
|
||||
std::to_string(backend_ctx->adreno_wave_size);
|
||||
if (has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
@@ -735,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
||||
|
||||
// Gemv 2048, 16384
|
||||
CL_gemv_compile_opts =
|
||||
" -cl-std=CL2.0 "
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=2048 "
|
||||
" -DBLOCK_STRIDE_A=16384 "
|
||||
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
|
||||
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=2048 "
|
||||
" -DBLOCK_STRIDE_A=16384 "
|
||||
" -DSIMDGROUP_WIDTH=" +
|
||||
std::to_string(backend_ctx->adreno_wave_size);
|
||||
if (has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
@@ -750,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
||||
|
||||
// Gemv 5504, 44032
|
||||
CL_gemv_compile_opts =
|
||||
" -cl-std=CL2.0 "
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=5504 "
|
||||
" -DBLOCK_STRIDE_A=44032 "
|
||||
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
|
||||
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=5504 "
|
||||
" -DBLOCK_STRIDE_A=44032 "
|
||||
" -DSIMDGROUP_WIDTH=" +
|
||||
std::to_string(backend_ctx->adreno_wave_size);
|
||||
if (has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
@@ -765,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
||||
|
||||
// Gemv 16000, 128000
|
||||
CL_gemv_compile_opts =
|
||||
" -cl-std=CL2.0 "
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=16000 "
|
||||
" -DBLOCK_STRIDE_A=128000 "
|
||||
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
|
||||
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -DLINE_STRIDE_A=16000 "
|
||||
" -DBLOCK_STRIDE_A=128000 "
|
||||
" -DSIMDGROUP_WIDTH=" +
|
||||
std::to_string(backend_ctx->adreno_wave_size);
|
||||
if (has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
|
||||
@@ -5,23 +5,24 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache1[2][BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache2[2][BLOCK_SIZE/16][16];
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
uint csel = 0;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
csel ^= 1;
|
||||
|
||||
barrier();
|
||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||
if (i < num_blocks_per_row) {
|
||||
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
|
||||
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
}
|
||||
barrier();
|
||||
|
||||
@@ -29,8 +30,8 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
continue;
|
||||
} else {
|
||||
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
|
||||
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
|
||||
sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -57,22 +58,22 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[csel][ix][ 8*v_im] * qs_u32_0[l ],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache1[csel][ix][1 + 8*v_im] * qs_u32_0[l+2],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache1[csel][ix][2 + 8*v_im] * qs_u32_2[l ],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache1[csel][ix][3 + 8*v_im] * qs_u32_2[l+2],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache1[csel][ix][4 + 8*v_im] * qs_u32_4[l ],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache1[csel][ix][5 + 8*v_im] * qs_u32_4[l+2],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache1[csel][ix][6 + 8*v_im] * qs_u32_6[l ],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache1[csel][ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[csel][ix][ 8*v_im],
|
||||
fma(FLOAT_TYPE(b16[l]), sccache2[csel][ix][1 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b32[l]), sccache2[csel][ix][2 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b48[l]), sccache2[csel][ix][3 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b64[l]), sccache2[csel][ix][4 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b80[l]), sccache2[csel][ix][5 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b96[l]), sccache2[csel][ix][6 + 8*v_im],
|
||||
fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2))))))));
|
||||
}
|
||||
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
|
||||
}
|
||||
|
||||
@@ -5,20 +5,21 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
|
||||
shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][2][8];
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
uint csel = 0;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
csel ^= 1;
|
||||
|
||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||
barrier();
|
||||
if (i < num_blocks_per_row)
|
||||
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
barrier();
|
||||
|
||||
if (i >= num_blocks_per_row)
|
||||
@@ -40,8 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
|
||||
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
|
||||
|
||||
if (all_threads) {
|
||||
barrier();
|
||||
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -59,14 +59,14 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
|
||||
fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
|
||||
fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
|
||||
fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
|
||||
fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
|
||||
fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
|
||||
fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
|
||||
fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
|
||||
sum = fma(FLOAT_TYPE( b0[l]) * sccache[csel][ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
|
||||
fma(FLOAT_TYPE( b16[l]) * sccache[csel][ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
|
||||
fma(FLOAT_TYPE( b32[l]) * sccache[csel][ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
|
||||
fma(FLOAT_TYPE( b48[l]) * sccache[csel][ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
|
||||
fma(FLOAT_TYPE( b64[l]) * sccache[csel][ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
|
||||
fma(FLOAT_TYPE( b80[l]) * sccache[csel][ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
|
||||
fma(FLOAT_TYPE( b96[l]) * sccache[csel][ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
|
||||
fma(FLOAT_TYPE(b112[l]) * sccache[csel][ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
|
||||
}
|
||||
temp[j][n] = fma(d, sum, temp[j][n]);
|
||||
}
|
||||
|
||||
@@ -6,20 +6,21 @@
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
|
||||
shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][16];
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
uint csel = 0;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
csel ^= 1;
|
||||
|
||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||
barrier();
|
||||
if (i < num_blocks_per_row)
|
||||
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
barrier();
|
||||
|
||||
if (i >= num_blocks_per_row)
|
||||
@@ -51,8 +52,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
|
||||
|
||||
if (all_threads) {
|
||||
barrier();
|
||||
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
|
||||
sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
|
||||
}
|
||||
temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
|
||||
temp[j][n] = fma(fma(sum[0], sccache[csel][ix][s_offset], fma(sum[1], sccache[csel][ix][s_offset + 2], fma(sum[2], sccache[csel][ix][s_offset + 4], sum[3] * sccache[csel][ix][s_offset + 6]))), d, temp[j][n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,13 +120,7 @@ int main(int argc, char * argv[]) {
|
||||
generate_data(0.0, test_data.size(), test_data.data());
|
||||
generate_data(1.0, test_data2.size(), test_data2.data());
|
||||
|
||||
// Initialize GGML, ensures float conversion tables are initialized
|
||||
struct ggml_init_params ggml_params = {
|
||||
/* .mem_size = */ 1*1024,
|
||||
/* .mem_buffer = */ NULL,
|
||||
/* .no_alloc = */ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(ggml_params);
|
||||
ggml_cpu_init();
|
||||
|
||||
int num_failed = 0;
|
||||
bool failed = false;
|
||||
@@ -188,7 +182,5 @@ int main(int argc, char * argv[]) {
|
||||
printf("%d tests failed\n", num_failed);
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
|
||||
return num_failed > 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user