mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-07 01:24:24 +00:00
Compare commits
25 Commits
b4424
...
gguf-v0.14
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a1d9c25fa | ||
|
|
1bf839b1e8 | ||
|
|
f7cd13301c | ||
|
|
4d2b3d8804 | ||
|
|
c07d437bbd | ||
|
|
99a3755a3c | ||
|
|
c792dcf488 | ||
|
|
80ccf5d725 | ||
|
|
a3c1232c3f | ||
|
|
8cef75c743 | ||
|
|
0d52a69e4b | ||
|
|
02f0430141 | ||
|
|
bec2183f2c | ||
|
|
53ff6b9b9f | ||
|
|
017cc5f446 | ||
|
|
a3d50bc022 | ||
|
|
a4dd490069 | ||
|
|
c0d6f790d0 | ||
|
|
dc7cef9f37 | ||
|
|
ecebbd292d | ||
|
|
96be8c3264 | ||
|
|
e6e7c75d94 | ||
|
|
09186fabbe | ||
|
|
96a1dc27c3 | ||
|
|
6369f867a4 |
12
.github/ISSUE_TEMPLATE/010-bug-compilation.yml
vendored
12
.github/ISSUE_TEMPLATE/010-bug-compilation.yml
vendored
@@ -65,12 +65,22 @@ body:
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: command
|
||||
attributes:
|
||||
label: Compile command
|
||||
description: >
|
||||
Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
Please copy and paste any relevant log output, including any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
|
||||
12
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
12
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
@@ -52,6 +52,16 @@ body:
|
||||
- Other (Please specify in the next section)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: command
|
||||
attributes:
|
||||
label: Command line
|
||||
description: >
|
||||
Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
@@ -74,7 +84,7 @@ body:
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
|
||||
4
.github/workflows/build.yml
vendored
4
.github/workflows/build.yml
vendored
@@ -665,7 +665,7 @@ jobs:
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
- build: 'llvm-arm64-opencl-adreno'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
|
||||
@@ -1237,7 +1237,7 @@ jobs:
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: anzz1/action-create-release@v1
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
|
||||
3
.github/workflows/docker.yml
vendored
3
.github/workflows/docker.yml
vendored
@@ -97,10 +97,9 @@ jobs:
|
||||
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
if: ${{ matrix.config.free_disk_space == true }}
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
# this might remove tools that are actually needed,
|
||||
# if set to "true" but frees about 6 GB
|
||||
|
||||
4
.github/workflows/editorconfig.yml
vendored
4
.github/workflows/editorconfig.yml
vendored
@@ -23,5 +23,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@main
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@v2
|
||||
with:
|
||||
version: v3.0.3
|
||||
- run: editorconfig-checker
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
|
||||
|
||||
/ci/ @ggerganov
|
||||
/.devops/ @ngxson
|
||||
/.devops/*.Dockerfile @ngxson
|
||||
/examples/server/ @ngxson
|
||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/gguf.cpp @JohannesGaessler
|
||||
|
||||
@@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
|
||||
return *this;
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
|
||||
this->excludes = std::move(excludes);
|
||||
return *this;
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_env(const char * env) {
|
||||
help = help + "\n(env: " + env + ")";
|
||||
this->env = env;
|
||||
@@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) {
|
||||
return examples.find(ex) != examples.end();
|
||||
}
|
||||
|
||||
bool common_arg::is_exclude(enum llama_example ex) {
|
||||
return excludes.find(ex) != excludes.end();
|
||||
}
|
||||
|
||||
bool common_arg::get_value_from_env(std::string & output) {
|
||||
if (env == nullptr) return false;
|
||||
char * value = std::getenv(env);
|
||||
@@ -420,7 +429,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
||||
*/
|
||||
auto add_opt = [&](common_arg arg) {
|
||||
if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
|
||||
if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
|
||||
ctx_arg.options.push_back(std::move(arg));
|
||||
}
|
||||
};
|
||||
@@ -649,7 +658,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.prompt = value;
|
||||
}
|
||||
));
|
||||
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--no-perf"},
|
||||
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||
@@ -673,7 +682,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
}
|
||||
));
|
||||
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--in-file"}, "FNAME",
|
||||
"an input file (repeat to specify multiple files)",
|
||||
@@ -700,7 +709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.prompt = ss.str();
|
||||
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
|
||||
}
|
||||
));
|
||||
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-e", "--escape"},
|
||||
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
struct common_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::set<enum llama_example> excludes = {};
|
||||
std::vector<const char *> args;
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
@@ -53,9 +54,11 @@ struct common_arg {
|
||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||
|
||||
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
|
||||
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
||||
common_arg & set_env(const char * env);
|
||||
common_arg & set_sparam();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool is_exclude(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output);
|
||||
bool has_value_from_env();
|
||||
std::string to_string();
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
|
||||
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
|
||||
base_name = lora_tensor_name.replace("base_model.model.", "")
|
||||
base_name = base_name.replace(".lora_A.weight", ".weight")
|
||||
base_name = base_name.replace(".lora_B.weight", ".weight")
|
||||
# models produced by mergekit-extract-lora have token embeddings in the adapter
|
||||
base_name = base_name.replace(".lora_embedding_A", ".weight")
|
||||
base_name = base_name.replace(".lora_embedding_B", ".weight")
|
||||
return base_name
|
||||
|
||||
|
||||
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--base", type=Path,
|
||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-model-id", type=str,
|
||||
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
|
||||
@@ -290,6 +297,7 @@ if __name__ == '__main__':
|
||||
|
||||
dir_base_model: Path | None = args.base
|
||||
dir_lora: Path = args.lora_path
|
||||
base_model_id: str | None = args.base_model_id
|
||||
lora_config = dir_lora / "adapter_config.json"
|
||||
input_model = dir_lora / "adapter_model.safetensors"
|
||||
|
||||
@@ -313,7 +321,10 @@ if __name__ == '__main__':
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
# load base model
|
||||
if dir_base_model is None:
|
||||
if base_model_id is not None:
|
||||
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
|
||||
hparams = load_hparams_from_hf(base_model_id)
|
||||
elif dir_base_model is None:
|
||||
if "base_model_name_or_path" in lparams:
|
||||
model_id = lparams["base_model_name_or_path"]
|
||||
logger.info(f"Loading base model from Hugging Face: {model_id}")
|
||||
@@ -371,11 +382,16 @@ if __name__ == '__main__':
|
||||
if self.lazy:
|
||||
tensor = LazyTorchTensor.from_eager(tensor)
|
||||
base_name = get_base_tensor_name(name)
|
||||
is_lora_a = ".lora_A.weight" in name
|
||||
is_lora_b = ".lora_B.weight" in name
|
||||
# note: mergekit-extract-lora also adds token embeddings to the adapter
|
||||
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
|
||||
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
|
||||
if not is_lora_a and not is_lora_b:
|
||||
if ".base_layer.weight" in name:
|
||||
continue
|
||||
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
|
||||
if "_layernorm" in name or ".norm" in name:
|
||||
yield (base_name, tensor)
|
||||
continue
|
||||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
|
||||
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
|
||||
@@ -407,9 +423,21 @@ if __name__ == '__main__':
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
# mergekit-extract-lora add these layernorm to the adapter
|
||||
if "_norm" in dest_name:
|
||||
assert dest_data.dim() == 1
|
||||
yield (dest_name, dest_data)
|
||||
continue
|
||||
|
||||
# otherwise, we must get the lora_A and lora_B tensors
|
||||
assert isinstance(dest_data, LoraTorchTensor)
|
||||
lora_a, lora_b = dest_data.get_lora_A_B()
|
||||
|
||||
# note: mergekit-extract-lora flip and transpose A and B
|
||||
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
|
||||
if "token_embd.weight" in dest_name:
|
||||
lora_a = lora_a.T
|
||||
|
||||
yield (dest_name + ".lora_a", lora_a)
|
||||
yield (dest_name + ".lora_b", lora_b)
|
||||
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "pca.hpp"
|
||||
#include "mean.hpp"
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <cstdlib> /* abort() */
|
||||
#include <cstddef>
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <stdexcept>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <climits>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <stdexcept>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
@@ -296,7 +298,7 @@ struct split_strategy {
|
||||
total_size += ggml_nbytes(t);
|
||||
}
|
||||
total_size = total_size / 1000 / 1000; // convert to megabytes
|
||||
printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
|
||||
printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
|
||||
i_split++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cinttypes>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#undef MIN
|
||||
@@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ctx, i);
|
||||
const size_t size = gguf_get_tensor_size (ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ctx, i);
|
||||
const size_t size = gguf_get_tensor_size (ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
|
||||
printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
|
||||
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
|
||||
|
||||
// print first 10 elements
|
||||
const float * data = (const float *) cur->data;
|
||||
@@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
||||
const float * data = (const float *) cur->data;
|
||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||
if (data[j] != 100 + i) {
|
||||
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
|
||||
fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
|
||||
gguf_free(ctx);
|
||||
return false;
|
||||
}
|
||||
@@ -245,6 +247,8 @@ int main(int argc, char ** argv) {
|
||||
check_data = false;
|
||||
}
|
||||
|
||||
srand(123456);
|
||||
|
||||
const std::string fname(argv[1]);
|
||||
const std::string mode (argv[2]);
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
//#ifdef GGML_USE_CUDA
|
||||
//#include "ggml-cuda.h"
|
||||
@@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
@@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
total_size_org += orig_size;
|
||||
total_size_new += new_size;
|
||||
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
|
||||
fout.write((const char *)new_data, new_size);
|
||||
size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
|
||||
for (size_t j = 0; j < pad; ++j) {
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
# include <curl/curl.h>
|
||||
#endif
|
||||
|
||||
#include <signal.h>
|
||||
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
@@ -25,6 +27,13 @@
|
||||
#include "json.hpp"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
|
||||
[[noreturn]] static void sigint_handler(int) {
|
||||
printf("\n");
|
||||
exit(0); // not ideal, but it's the only way to guarantee exit in all cases
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_ATTRIBUTE_FORMAT(1, 2)
|
||||
static std::string fmt(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
@@ -83,6 +92,7 @@ class Opt {
|
||||
}
|
||||
|
||||
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
|
||||
ctx_params.n_ctx = ctx_params.n_batch;
|
||||
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
|
||||
temperature = temperature >= 0 ? temperature : temperature_default;
|
||||
|
||||
@@ -800,7 +810,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
|
||||
|
||||
static int read_user_input(std::string & user) {
|
||||
std::getline(std::cin, user);
|
||||
return user.empty(); // Should have data in happy path
|
||||
if (std::cin.eof()) {
|
||||
printf("\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (user == "/bye") {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (user.empty()) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0; // Should have data in happy path
|
||||
}
|
||||
|
||||
// Function to generate a response based on the prompt
|
||||
@@ -867,7 +890,25 @@ static bool is_stdout_a_terminal() {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Function to tokenize the prompt
|
||||
// Function to handle user input
|
||||
static int get_user_input(std::string & user_input, const std::string & user) {
|
||||
while (true) {
|
||||
const int ret = handle_user_input(user_input, user);
|
||||
if (ret == 1) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (ret == 2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Main chat loop function
|
||||
static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
||||
int prev_len = 0;
|
||||
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
||||
@@ -875,7 +916,8 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
||||
while (true) {
|
||||
// Get user input
|
||||
std::string user_input;
|
||||
while (handle_user_input(user_input, user)) {
|
||||
if (get_user_input(user_input, user) == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
add_message("user", user.empty() ? user_input : user, llama_data);
|
||||
@@ -916,7 +958,23 @@ static std::string read_pipe_data() {
|
||||
return result.str();
|
||||
}
|
||||
|
||||
static void ctrl_c_handling() {
|
||||
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset(&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined(_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
ctrl_c_handling();
|
||||
Opt opt;
|
||||
const int ret = opt.init(argc, argv);
|
||||
if (ret == 2) {
|
||||
|
||||
@@ -45,10 +45,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
||||
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
||||
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||
| `--no-escape` | do not process escape sequences |
|
||||
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
|
||||
|
||||
@@ -3797,7 +3797,7 @@ int main(int argc, char ** argv) {
|
||||
data["input_extra"] = input_extra; // default to empty array if it's not exist
|
||||
|
||||
std::string prompt = json_value(data, "prompt", std::string());
|
||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
|
||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
|
||||
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
|
||||
data["prompt"] = format_infill(
|
||||
ctx_server.ctx,
|
||||
|
||||
@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Ann|small|shiny)+", res.body["content"])
|
||||
assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
|
||||
|
||||
|
||||
def test_infill_with_input_extra():
|
||||
|
||||
@@ -243,7 +243,8 @@ set(GGML_PUBLIC_HEADERS
|
||||
include/ggml-metal.h
|
||||
include/ggml-rpc.h
|
||||
include/ggml-sycl.h
|
||||
include/ggml-vulkan.h)
|
||||
include/ggml-vulkan.h
|
||||
include/gguf.h)
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
#if (GGML_METAL)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
#include <memory>
|
||||
|
||||
// Smart pointers for ggml types
|
||||
|
||||
@@ -241,12 +241,6 @@
|
||||
#define GGML_ROPE_TYPE_MROPE 8
|
||||
#define GGML_ROPE_TYPE_VISION 24
|
||||
|
||||
#define GGUF_MAGIC "GGUF"
|
||||
|
||||
#define GGUF_VERSION 3
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
#define GGML_UNUSED(x) (void)(x)
|
||||
|
||||
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
||||
@@ -403,12 +397,6 @@ extern "C" {
|
||||
GGML_PREC_F32,
|
||||
};
|
||||
|
||||
enum ggml_backend_type {
|
||||
GGML_BACKEND_TYPE_CPU = 0,
|
||||
GGML_BACKEND_TYPE_GPU = 10,
|
||||
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum ggml_ftype {
|
||||
GGML_FTYPE_UNKNOWN = -1,
|
||||
@@ -587,8 +575,6 @@ extern "C" {
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
@@ -2111,132 +2097,6 @@ extern "C" {
|
||||
int64_t n_per_row,
|
||||
const float * imatrix);
|
||||
|
||||
//
|
||||
// gguf
|
||||
//
|
||||
|
||||
enum gguf_type {
|
||||
GGUF_TYPE_UINT8 = 0,
|
||||
GGUF_TYPE_INT8 = 1,
|
||||
GGUF_TYPE_UINT16 = 2,
|
||||
GGUF_TYPE_INT16 = 3,
|
||||
GGUF_TYPE_UINT32 = 4,
|
||||
GGUF_TYPE_INT32 = 5,
|
||||
GGUF_TYPE_FLOAT32 = 6,
|
||||
GGUF_TYPE_BOOL = 7,
|
||||
GGUF_TYPE_STRING = 8,
|
||||
GGUF_TYPE_ARRAY = 9,
|
||||
GGUF_TYPE_UINT64 = 10,
|
||||
GGUF_TYPE_INT64 = 11,
|
||||
GGUF_TYPE_FLOAT64 = 12,
|
||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||
};
|
||||
|
||||
struct gguf_context;
|
||||
|
||||
struct gguf_init_params {
|
||||
bool no_alloc;
|
||||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
struct ggml_context ** ctx;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
||||
|
||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
||||
|
||||
// will abort if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
||||
|
||||
// removes key if it exists
|
||||
GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
||||
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
||||
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
|
||||
|
||||
// set or add KV pairs from another context
|
||||
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
|
||||
|
||||
// manage tensor info
|
||||
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
||||
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
||||
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
|
||||
|
||||
// writing gguf files can be done in 2 ways:
|
||||
//
|
||||
// - write the entire gguf_context to a binary file in a single pass:
|
||||
//
|
||||
// gguf_write_to_file(ctx, fname);
|
||||
//
|
||||
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
||||
//
|
||||
// FILE * f = fopen(fname, "wb");
|
||||
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
|
||||
// fwrite(f, ...);
|
||||
// void * data = gguf_meta_get_meta_data(ctx);
|
||||
// fseek(f, 0, SEEK_SET);
|
||||
// fwrite(f, data, gguf_get_meta_size(ctx));
|
||||
// free(data);
|
||||
// fclose(f);
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
# if defined(__GNUC__)
|
||||
|
||||
202
ggml/include/gguf.h
Normal file
202
ggml/include/gguf.h
Normal file
@@ -0,0 +1,202 @@
|
||||
// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
|
||||
// GGUF files have the following structure:
|
||||
//
|
||||
// 1. File magic "GGUF" (4 bytes).
|
||||
// 2. File version (uint32_t).
|
||||
// 3. Number of ggml tensors in file (int64_t).
|
||||
// 4. Number of key-value-pairs in file (int64_t).
|
||||
// 5. For each KV pair:
|
||||
// 1. The key (string).
|
||||
// 2. The value type (gguf_type).
|
||||
// 3a. If the value type is GGUF_TYPE_ARRAY:
|
||||
// 1. The type of the array (gguf_type).
|
||||
// 2. The number of elements in the array (uint64_t).
|
||||
// 3. The binary representation of each element in the array.
|
||||
// 3b. Otherwise:
|
||||
// 1. The binary representation of the value.
|
||||
// 6. For each ggml tensor:
|
||||
// 1. The tensor name (string).
|
||||
// 2. The number of dimensions of the tensor (uint32_t).
|
||||
// 3. For each dimension:
|
||||
// 1. The size of the tensor in the dimension (int64_t).
|
||||
// 4. The tensor data type (ggml_type).
|
||||
// 5. The tensor data offset in the tensor data binary blob (uint64_t).
|
||||
// 7. The tensor data binary blob (optional, aligned).
|
||||
//
|
||||
// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
|
||||
// All enums are stored as int32_t.
|
||||
// All bool values are stored as int8_t.
|
||||
// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
|
||||
// otherwise GGUF_DEFAULT_ALIGNMENT is used.
|
||||
//
|
||||
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define GGUF_MAGIC "GGUF"
|
||||
#define GGUF_VERSION 3
|
||||
|
||||
#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// types that can be stored as GGUF KV data
|
||||
enum gguf_type {
|
||||
GGUF_TYPE_UINT8 = 0,
|
||||
GGUF_TYPE_INT8 = 1,
|
||||
GGUF_TYPE_UINT16 = 2,
|
||||
GGUF_TYPE_INT16 = 3,
|
||||
GGUF_TYPE_UINT32 = 4,
|
||||
GGUF_TYPE_INT32 = 5,
|
||||
GGUF_TYPE_FLOAT32 = 6,
|
||||
GGUF_TYPE_BOOL = 7,
|
||||
GGUF_TYPE_STRING = 8,
|
||||
GGUF_TYPE_ARRAY = 9,
|
||||
GGUF_TYPE_UINT64 = 10,
|
||||
GGUF_TYPE_INT64 = 11,
|
||||
GGUF_TYPE_FLOAT64 = 12,
|
||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||
};
|
||||
|
||||
struct gguf_context;
|
||||
|
||||
struct gguf_init_params {
|
||||
bool no_alloc;
|
||||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
struct ggml_context ** ctx;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
|
||||
GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
|
||||
|
||||
// will abort if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
|
||||
GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id);
|
||||
|
||||
// get raw pointer to the first element of the array with the given key_id
|
||||
// for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
|
||||
|
||||
// get ith C string from array with given key_id
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
||||
|
||||
GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
|
||||
GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id);
|
||||
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id);
|
||||
GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id);
|
||||
|
||||
// removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
|
||||
GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
|
||||
|
||||
// overrides an existing KV pair or adds a new one, the new KV pair is always at the back
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
||||
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
||||
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
|
||||
// creates a new array with n elements of the given type and copies the corresponding number of bytes from data
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
|
||||
|
||||
// creates a new array with n strings and copies the corresponding strings from data
|
||||
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
|
||||
|
||||
// set or add KV pairs from another context
|
||||
GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
|
||||
|
||||
// add tensor to GGUF context, tensor name must be unique
|
||||
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
||||
|
||||
// after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
|
||||
// in such a way that the tensor data remains as one contiguous block (except for padding)
|
||||
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
||||
|
||||
// assumes that at least gguf_get_tensor_size bytes can be read from data
|
||||
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
|
||||
|
||||
// writing gguf files can be done in 3 ways:
|
||||
//
|
||||
// - write the entire gguf_context to a binary file in a single pass:
|
||||
//
|
||||
// gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
|
||||
//
|
||||
// - write only the meta data to a file, then re-open the file and append the tensor data:
|
||||
//
|
||||
// gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
|
||||
// FILE * f = fopen(fname, "ab");
|
||||
// fwrite(f, ...); // write tensor data
|
||||
// fclose(f);
|
||||
//
|
||||
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
||||
//
|
||||
// FILE * f = fopen(fname, "wb");
|
||||
// const size_t size_meta = gguf_get_meta_size(ctx);
|
||||
// fseek(f, size_meta, SEEK_SET);
|
||||
// fwrite(f, ...); // write tensor data
|
||||
// void * data = malloc(size_meta);
|
||||
// gguf_get_meta_data(ctx, data);
|
||||
// rewind(f);
|
||||
// fwrite(data, 1, data, f);
|
||||
// free(data);
|
||||
// fclose(f);
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
|
||||
// writes the meta data to pointer "data"
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -208,6 +208,7 @@ add_library(ggml-base
|
||||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
../include/ggml-opt.h
|
||||
../include/gguf.h
|
||||
ggml.c
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
@@ -215,7 +216,8 @@ add_library(ggml-base
|
||||
ggml-threading.cpp
|
||||
ggml-threading.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h)
|
||||
ggml-quants.h
|
||||
gguf.cpp)
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
|
||||
|
||||
@@ -574,4 +574,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||
ggml_backend_load_best("opencl", silent, dir_path);
|
||||
ggml_backend_load_best("musa", silent, dir_path);
|
||||
ggml_backend_load_best("cpu", silent, dir_path);
|
||||
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
|
||||
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
|
||||
if (backend_path) {
|
||||
ggml_backend_load(backend_path);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -764,7 +764,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
if (src_backend_id == sched->n_backends - 1) {
|
||||
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
|
||||
@@ -4169,6 +4169,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g
|
||||
buffer->buft = buft;
|
||||
buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
|
||||
buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
|
||||
buffer->iface.get_tensor = nullptr;
|
||||
buffer->iface.cpy_tensor = nullptr;
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <array>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define NOINLINE __declspec(noinline)
|
||||
@@ -1051,6 +1052,704 @@ class tinyBLAS_Q0_AVX {
|
||||
} \
|
||||
} \
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
tinyBLAS_Q0_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int size>
|
||||
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
|
||||
vector signed int vec_C[4];
|
||||
vector float CA[4] = {0};
|
||||
vector float res[4] = {0};
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VA, typename VB>
|
||||
void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
|
||||
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
|
||||
VB t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
vector unsigned char xor_vector;
|
||||
uint8_t flip_vec = 0x80;
|
||||
xor_vector = vec_splats(flip_vec);
|
||||
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
|
||||
C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
|
||||
C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
|
||||
C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
|
||||
C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
|
||||
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||
__builtin_vsx_disassemble_pair(c5, &C5);
|
||||
__builtin_vsx_disassemble_pair(c6, &C6);
|
||||
__builtin_vsx_disassemble_pair(c7, &C7);
|
||||
__builtin_vsx_disassemble_pair(c8, &C8);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
t1 = vec_perm(c5[0], c6[0], swiz1);
|
||||
t2 = vec_perm(c5[0], c6[0], swiz2);
|
||||
t3 = vec_perm(c7[0], c8[0], swiz1);
|
||||
t4 = vec_perm(c7[0], c8[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+128);
|
||||
vec_xst(t6, 0, vecOffset+144);
|
||||
vec_xst(t7, 0, vecOffset+160);
|
||||
vec_xst(t8, 0, vecOffset+176);
|
||||
|
||||
t1 = vec_perm(c5[1], c6[1], swiz1);
|
||||
t2 = vec_perm(c5[1], c6[1], swiz2);
|
||||
t3 = vec_perm(c7[1], c8[1], swiz1);
|
||||
t4 = vec_perm(c7[1], c8[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+192);
|
||||
vec_xst(t6, 0, vecOffset+208);
|
||||
vec_xst(t7, 0, vecOffset+224);
|
||||
vec_xst(t8, 0, vecOffset+240);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
aoffset4 += lda;
|
||||
aoffset5 += lda;
|
||||
aoffset6 += lda;
|
||||
aoffset7 += lda;
|
||||
aoffset8 += lda;
|
||||
vecOffset += 256;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
j--;
|
||||
} while(j > 0);
|
||||
}
|
||||
|
||||
if (rows & 4) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
|
||||
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
aoffset4 += lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
if (rows & 3) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
break;
|
||||
}
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
int m_rem = MIN(m - m0, 8);
|
||||
int n_rem = MIN(n - n0, 8);
|
||||
// TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
|
||||
// issues. After resolving them, below code will be enabled.
|
||||
/*if (m_rem >= 16 && n_rem >= 8) {
|
||||
mc = 16;
|
||||
nc = 8;
|
||||
gemm<16,8>(m0, m, n0, n);
|
||||
} else if(m_rem >= 8 && n_rem >= 16) {
|
||||
mc = 8;
|
||||
nc = 16;
|
||||
gemm<8,16>(m0, m, n0, n);
|
||||
}*/
|
||||
if (m_rem >= 8 && n_rem >= 8) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 8) {
|
||||
mc = 4;
|
||||
nc = 8;
|
||||
gemm<4,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 8 && n_rem >= 4) {
|
||||
mc = 8;
|
||||
nc = 4;
|
||||
gemm<8,4>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 4) {
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm_small<4, 4>(m0, m, n0, n);
|
||||
} else if ((m_rem < 4) && (n_rem > 4)) {
|
||||
nc = 4;
|
||||
switch(m_rem) {
|
||||
case 1:
|
||||
mc = 1;
|
||||
gemm_small<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
mc = 2;
|
||||
gemm_small<2, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
mc = 3;
|
||||
gemm_small<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else if ((m_rem > 4) && (n_rem < 4)) {
|
||||
mc = 4;
|
||||
switch(n_rem) {
|
||||
case 1:
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
switch((m_rem << 4) | n_rem) {
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm_small<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm_small<3, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm_small<3, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm_small<3, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm_small<2, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm_small<2, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm_small<2, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm_small<2, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm_small<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm_small<1, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm_small<1, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm_small<1, 1>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 4> comparray;
|
||||
vector float fin_res[8] = {0};
|
||||
vector float vs[8] = {0};
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
|
||||
}
|
||||
for (int I = 0; I<4; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
|
||||
}
|
||||
save_res<4, 4>(ii, jj, 0, fin_res);
|
||||
save_res<4, 4>(ii, jj+4, 4, fin_res);
|
||||
}
|
||||
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[8] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 8> comparray;
|
||||
vector float fin_res[8] = {0};
|
||||
vector float vs[8] = {0};
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
|
||||
}
|
||||
for (int I = 0; I<8; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
}
|
||||
save_res<4, 4>(ii, jj, 0, fin_res);
|
||||
save_res<4, 4>(ii+4, jj, 4, fin_res);
|
||||
}
|
||||
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
std::array<int, 8> comparray;
|
||||
vector float fin_res[16] = {0};
|
||||
vector float vs[16] = {0};
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
|
||||
}
|
||||
for (int I = 0; I<8; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
|
||||
compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
|
||||
}
|
||||
save_res<4, 4>(ii, jj, 0, fin_res);
|
||||
save_res<4, 4>(ii+4, jj, 4, fin_res);
|
||||
save_res<4, 4>(ii, jj+4, 8, fin_res);
|
||||
save_res<4, 4>(ii+4, jj+4, 12, fin_res);
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
vec_t vec_A[8], vec_B[8] = {0};
|
||||
vector signed int vec_C[4];
|
||||
acc_t acc_0;
|
||||
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
std::array<int, RM> comparray;
|
||||
vector float res[4] = {0};
|
||||
vector float fin_res[4] = {0};
|
||||
vector float vs[4] = {0};
|
||||
vector float CA[4] = {0};
|
||||
__builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
|
||||
__builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x+=4) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
|
||||
}
|
||||
for (int I = 0; I<RM; I++) {
|
||||
for (int J = 0; J<RN; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < RM; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
|
||||
for (int i = 0; i < RM; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
|
||||
}
|
||||
}
|
||||
save_res<RM, RN>(ii, jj, 0, fin_res);
|
||||
}
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else {
|
||||
static_assert(false, "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
kernel<RM, RN>(ii, jj);
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *C;
|
||||
TA *At;
|
||||
TB *Bt;
|
||||
const int64_t k;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_PPC {
|
||||
public:
|
||||
@@ -1070,13 +1769,17 @@ class tinyBLAS_PPC {
|
||||
|
||||
void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
|
||||
|
||||
void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
|
||||
template<typename VA>
|
||||
void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
|
||||
int64_t i, j;
|
||||
float *aoffset = NULL, *boffset = NULL;
|
||||
float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
|
||||
aoffset = const_cast<float*>(a);
|
||||
TA *aoffset = NULL, *boffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
|
||||
VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
|
||||
VA t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
aoffset = const_cast<TA*>(a);
|
||||
boffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
@@ -1092,9 +1795,6 @@ class tinyBLAS_PPC {
|
||||
aoffset += 8 * lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
|
||||
@@ -1174,21 +1874,19 @@ class tinyBLAS_PPC {
|
||||
} while(i > 0);
|
||||
}
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4, c5, c6, c7, c8;
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c4 = vec_xl(0, aoffset4);
|
||||
c5 = vec_xl(0, aoffset5);
|
||||
c6 = vec_xl(0, aoffset6);
|
||||
c7 = vec_xl(0, aoffset7);
|
||||
c8 = vec_xl(0, aoffset8);
|
||||
c1[0] = vec_xl(0, aoffset1);
|
||||
c2[0] = vec_xl(0, aoffset2);
|
||||
c3[0] = vec_xl(0, aoffset3);
|
||||
c4[0] = vec_xl(0, aoffset4);
|
||||
c5[0] = vec_xl(0, aoffset5);
|
||||
c6[0] = vec_xl(0, aoffset6);
|
||||
c7[0] = vec_xl(0, aoffset7);
|
||||
c8[0] = vec_xl(0, aoffset8);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t3 = vec_mergeh(c5, c6);
|
||||
t4 = vec_mergeh(c7, c8);
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_mergeh(c5[0], c6[0]);
|
||||
t4 = vec_mergeh(c7[0], c8[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
@@ -1198,10 +1896,10 @@ class tinyBLAS_PPC {
|
||||
vec_xst(t7, 0, boffset+8);
|
||||
vec_xst(t8, 0, boffset+12);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t3 = vec_mergel(c5, c6);
|
||||
t4 = vec_mergel(c7, c8);
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_mergel(c5[0], c6[0]);
|
||||
t4 = vec_mergel(c7[0], c8[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
@@ -1223,9 +1921,6 @@ class tinyBLAS_PPC {
|
||||
aoffset += 4 * lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
__vector_pair C1, C2, C3, C4;
|
||||
vector float c1[2], c2[2], c3[2], c4[2];
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
|
||||
@@ -1272,22 +1967,20 @@ class tinyBLAS_PPC {
|
||||
}
|
||||
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4;
|
||||
vector float t1, t2, t3, t4;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c4 = vec_xl(0, aoffset4);
|
||||
c1[0] = vec_xl(0, aoffset1);
|
||||
c2[0] = vec_xl(0, aoffset2);
|
||||
c3[0] = vec_xl(0, aoffset3);
|
||||
c4[0] = vec_xl(0, aoffset4);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset);
|
||||
vec_xst(t4, 0, boffset+4);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset+8);
|
||||
@@ -1299,21 +1992,19 @@ class tinyBLAS_PPC {
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4 = {0};
|
||||
vector float t1, t2, t3, t4;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c1[0] = vec_xl(0, aoffset1);
|
||||
c2[0] = vec_xl(0, aoffset2);
|
||||
c3[0] = vec_xl(0, aoffset3);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset);
|
||||
vec_xst(t4, 0, boffset+4);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset+8);
|
||||
@@ -1321,14 +2012,13 @@ class tinyBLAS_PPC {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void KERNEL_4x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[4], vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
for (int l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
|
||||
@@ -1343,8 +2033,8 @@ class tinyBLAS_PPC {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int64_t l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
|
||||
@@ -1364,8 +2054,8 @@ class tinyBLAS_PPC {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int64_t l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
|
||||
@@ -1387,8 +2077,8 @@ class tinyBLAS_PPC {
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
|
||||
for(int x = 0; x < 16; x+=2) {
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
|
||||
@@ -1571,15 +2261,15 @@ class tinyBLAS_PPC {
|
||||
vec_t vec_A[4], vec_B[4];
|
||||
for (int l=0; l<k; l+=4) {
|
||||
if (RN >= 4 && RM == 1) {
|
||||
float* a = const_cast<float*>(A+(ii)*lda+l);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
TA* a = const_cast<TA*>(A+(ii)*lda+l);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
vec_A[0] = (vec_t)vec_xl(0,a);
|
||||
vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
|
||||
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
|
||||
} else {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
|
||||
}
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
|
||||
@@ -1589,7 +2279,7 @@ class tinyBLAS_PPC {
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
|
||||
*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1812,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
|
||||
#elif defined(__MMA__)
|
||||
if (n < 8 && n != 4)
|
||||
return false;
|
||||
if (m < 8 && m != 4)
|
||||
return false;
|
||||
tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
|
||||
k, (const block_q8_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
// GGML internal header
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
@@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// expose GGUF internals for test code
|
||||
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
GGML_API struct gguf_buf gguf_buf_init(size_t size);
|
||||
GGML_API void gguf_buf_free(struct gguf_buf buf);
|
||||
|
||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <vector>
|
||||
|
||||
// expose GGUF internals for test code
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -27,15 +27,6 @@
|
||||
#endif
|
||||
#include <cstring>
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#define GGML_DEBUG 0
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef SOCKET sockfd_t;
|
||||
using ssize_t = __int64;
|
||||
@@ -411,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
initialized = true;
|
||||
}
|
||||
#else
|
||||
UNUSED(initialized);
|
||||
GGML_UNUSED(initialized);
|
||||
#endif
|
||||
auto sock = socket_connect(host.c_str(), port);
|
||||
if (sock == nullptr) {
|
||||
@@ -640,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
GGML_UNUSED(backend);
|
||||
// this is no-op because we don't have any async operations
|
||||
}
|
||||
|
||||
@@ -850,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
|
||||
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
|
||||
buffers.insert(buffer);
|
||||
} else {
|
||||
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
||||
GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -872,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
return false;
|
||||
}
|
||||
void * base = ggml_backend_buffer_get_base(buffer);
|
||||
@@ -884,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_free(buffer);
|
||||
@@ -896,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buffer, request.value);
|
||||
@@ -952,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||
if (tensor == nullptr) {
|
||||
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
@@ -1017,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
if (tensor == nullptr) {
|
||||
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
@@ -1051,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
|
||||
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
|
||||
if (src == nullptr || dst == nullptr) {
|
||||
GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
@@ -1385,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
|
||||
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
|
||||
|
||||
UNUSED(dev);
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
||||
// TODO: obtain value from the server
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
|
||||
UNUSED(dev);
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
@@ -1413,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const
|
||||
|
||||
return ggml_backend_rpc_init(ctx->endpoint.c_str());
|
||||
|
||||
UNUSED(params);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
@@ -1421,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
||||
|
||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||
|
||||
UNUSED(dev);
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
UNUSED(dev);
|
||||
UNUSED(op);
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(op);
|
||||
//TODO: call the remote backend and cache the results
|
||||
return true;
|
||||
}
|
||||
@@ -1463,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "RPC";
|
||||
|
||||
UNUSED(reg);
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
return 0;
|
||||
|
||||
UNUSED(reg);
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
|
||||
|
||||
UNUSED(reg);
|
||||
UNUSED(index);
|
||||
GGML_UNUSED(reg);
|
||||
GGML_UNUSED(index);
|
||||
}
|
||||
|
||||
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
@@ -1485,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
|
||||
}
|
||||
return NULL;
|
||||
|
||||
UNUSED(reg);
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
|
||||
|
||||
@@ -131,7 +131,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rwkv_wkv_f32_kernel(
|
||||
B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
|
||||
item_ct1, shared_mem_acc.get_pointer()
|
||||
item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -8,6 +8,20 @@ if (Vulkan_FOUND)
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
|
||||
# Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
|
||||
OUTPUT_VARIABLE glslc_output
|
||||
ERROR_VARIABLE glslc_error)
|
||||
|
||||
if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
|
||||
message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
|
||||
else()
|
||||
message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
|
||||
add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
endif()
|
||||
|
||||
# Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
@@ -69,11 +83,15 @@ if (Vulkan_FOUND)
|
||||
|
||||
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
|
||||
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
set(_ggml_vk_genshaders_cmd "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}")
|
||||
endif ()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${_ggml_vk_header}
|
||||
${_ggml_vk_source}
|
||||
|
||||
COMMAND "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}"
|
||||
COMMAND ${_ggml_vk_genshaders_cmd}
|
||||
--glslc ${Vulkan_GLSLC_EXECUTABLE}
|
||||
--input-dir ${_ggml_vk_input_dir}
|
||||
--output-dir ${_ggml_vk_output_dir}
|
||||
|
||||
@@ -1645,6 +1645,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
#undef CREATE_MM2
|
||||
} else
|
||||
#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (device->coopmat_support) {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||
@@ -1739,7 +1740,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
}
|
||||
#undef CREATE_MM2
|
||||
#undef CREATE_MM
|
||||
} else if (device->fp16) {
|
||||
} else
|
||||
#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (device->fp16) {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||
if (device->mul_mat ## ID ## _l) \
|
||||
@@ -2242,6 +2245,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
|
||||
}
|
||||
|
||||
#if defined(VK_KHR_cooperative_matrix)
|
||||
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
||||
coopmat_features.pNext = nullptr;
|
||||
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
||||
@@ -2251,6 +2255,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
|
||||
last_struct = (VkBaseOutStructure *)&coopmat_features;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(VK_NV_cooperative_matrix2)
|
||||
VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {};
|
||||
@@ -2283,7 +2288,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device_extensions.push_back("VK_EXT_subgroup_size_control");
|
||||
}
|
||||
|
||||
#if defined(VK_KHR_cooperative_matrix)
|
||||
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
|
||||
#endif
|
||||
|
||||
if (coopmat2_support) {
|
||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
@@ -2376,6 +2383,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
|
||||
#if defined(VK_KHR_cooperative_matrix)
|
||||
if (device->coopmat_support) {
|
||||
// Query supported shapes
|
||||
std::vector<VkCooperativeMatrixPropertiesKHR> cm_props;
|
||||
@@ -2442,7 +2450,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
if (device->coopmat_support) {
|
||||
device_extensions.push_back("VK_KHR_cooperative_matrix");
|
||||
}
|
||||
|
||||
#endif
|
||||
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||
|
||||
device_create_info = {
|
||||
@@ -2553,9 +2561,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
fp16_storage = true;
|
||||
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
||||
fp16_compute = true;
|
||||
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT")) {
|
||||
coopmat_support = true;
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
|
||||
@@ -2593,6 +2603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
// Pointer to the last chain element
|
||||
VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features;
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
||||
coopmat_features.pNext = nullptr;
|
||||
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
||||
@@ -2608,6 +2619,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||
|
||||
coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;
|
||||
#endif
|
||||
|
||||
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
||||
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
#version 460
|
||||
|
||||
#extension GL_KHR_cooperative_matrix : require
|
||||
|
||||
void main()
|
||||
{
|
||||
}
|
||||
@@ -342,9 +342,11 @@ void process_shaders() {
|
||||
matmul_shaders(true, matmul_id, false, false, false);
|
||||
matmul_shaders(true, matmul_id, false, false, true);
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
// Coopmat, fp32acc and fp16acc
|
||||
matmul_shaders(true, matmul_id, true, false, false);
|
||||
matmul_shaders(true, matmul_id, true, false, true);
|
||||
#endif
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
// Coopmat2, fp32acc and fp16acc
|
||||
|
||||
1276
ggml/src/ggml.c
1276
ggml/src/ggml.c
File diff suppressed because it is too large
Load Diff
1325
ggml/src/gguf.cpp
Normal file
1325
ggml/src/gguf.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -15,13 +15,13 @@ pip install gguf
|
||||
|
||||
[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
|
||||
|
||||
[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
|
||||
[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
|
||||
|
||||
[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
|
||||
[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
|
||||
|
||||
[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
|
||||
[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
|
||||
|
||||
[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
|
||||
## Development
|
||||
Maintainers who participate in development of this package are advised to install it in editable mode:
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.13.0"
|
||||
version = "0.14.0"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
{include = "gguf"},
|
||||
{include = "gguf/py.typed"},
|
||||
{include = "scripts"},
|
||||
]
|
||||
readme = "README.md"
|
||||
homepage = "https://ggml.ai"
|
||||
@@ -33,7 +32,7 @@ requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
|
||||
gguf-dump = "scripts:gguf_dump_entrypoint"
|
||||
gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
|
||||
gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint"
|
||||
gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
|
||||
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
|
||||
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
|
||||
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
|
||||
|
||||
@@ -1 +1 @@
|
||||
a2af72be7baf5b1f4a33d34e77e509e5e85b7cd7
|
||||
c8bd0fee71dc8328d93be301bbee06bc10d30429
|
||||
|
||||
@@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
} else {
|
||||
ab_map[name].b = cur;
|
||||
}
|
||||
} else if (str_endswith(name, "_norm.weight")) {
|
||||
// TODO: add support for norm vector
|
||||
// for now, we don't really care because most adapters still work fine without it
|
||||
continue;
|
||||
} else {
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
|
||||
}
|
||||
@@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
for (auto & it : ab_map) {
|
||||
const std::string & name = it.first;
|
||||
llama_lora_weight & w = it.second;
|
||||
bool is_token_embd = str_endswith(name, "token_embd.weight");
|
||||
|
||||
if (!w.a || !w.b) {
|
||||
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
|
||||
@@ -259,16 +264,23 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
|
||||
// device buft and device ctx
|
||||
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
|
||||
if (!model_tensor) {
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
|
||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
|
||||
}
|
||||
|
||||
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
|
||||
// validate tensor shape
|
||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
|
||||
}
|
||||
if (w.a->ne[1] != w.b->ne[0]) {
|
||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||
if (is_token_embd) {
|
||||
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
|
||||
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
|
||||
}
|
||||
} else {
|
||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
|
||||
}
|
||||
if (w.a->ne[1] != w.b->ne[0]) {
|
||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||
}
|
||||
}
|
||||
|
||||
// save tensor to adapter
|
||||
|
||||
@@ -45,6 +45,13 @@ struct llama_lora_weight {
|
||||
struct ggml_tensor * a = nullptr;
|
||||
struct ggml_tensor * b = nullptr;
|
||||
|
||||
// get actual scale based on rank and alpha
|
||||
float get_scale(float alpha, float adapter_scale) {
|
||||
const float rank = (float) b->ne[0];
|
||||
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
|
||||
return scale;
|
||||
}
|
||||
|
||||
llama_lora_weight() = default;
|
||||
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
|
||||
};
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cinttypes>
|
||||
@@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
|
||||
@@ -119,10 +119,10 @@ bool llama_kv_cache_init(
|
||||
|
||||
struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
struct llama_kv_cache & cache,
|
||||
const struct llama_ubatch & batch) {
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
const uint32_t n_seqs = batch.n_seqs;
|
||||
const uint32_t n_seq_tokens = batch.n_seq_tokens;
|
||||
const struct llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// For recurrent state architectures (like Mamba or RWKV),
|
||||
@@ -130,16 +130,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
// A slot should be always be contiguous.
|
||||
|
||||
// can only process batches with an equal number of new tokens in each sequence
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
GGML_ASSERT(ubatch.equal_seqs);
|
||||
|
||||
int32_t min = cache.size - 1;
|
||||
int32_t max = 0;
|
||||
|
||||
// everything should fit if all seq_ids are smaller than the max
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const uint32_t n_seq_id = batch.n_seq_id[s];
|
||||
const uint32_t n_seq_id = ubatch.n_seq_id[s];
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][j];
|
||||
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
|
||||
// too big seq_id
|
||||
@@ -198,7 +198,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
|
||||
// find usable cell range
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][0];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
||||
llama_kv_cell & seq_meta = cache.cells[seq_id];
|
||||
bool has_cell = false;
|
||||
if (seq_meta.tail >= 0) {
|
||||
@@ -237,7 +237,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
// gather and re-order
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
int32_t dst_id = s + min;
|
||||
int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
|
||||
int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
|
||||
if (dst_id != src_id) {
|
||||
llama_kv_cell & dst_cell = cache.cells[dst_id];
|
||||
llama_kv_cell & src_cell = cache.cells[src_id];
|
||||
@@ -258,7 +258,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
|
||||
// update the pos of the used seqs
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
int32_t cell_id = s + min;
|
||||
llama_kv_cell & cell = cache.cells[cell_id];
|
||||
|
||||
@@ -266,12 +266,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
// What should happen when the pos backtracks or skips a value?
|
||||
// Clearing the state mid-batch would require special-casing which isn't done.
|
||||
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
|
||||
__func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
|
||||
__func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
|
||||
}
|
||||
cell.pos = last_pos;
|
||||
cell.seq_id.clear();
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][j];
|
||||
cell.seq_id.insert(seq_id);
|
||||
cache.cells[seq_id].tail = cell_id;
|
||||
}
|
||||
@@ -325,10 +325,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
for (uint32_t s = 0; s < n_seqs; s++) {
|
||||
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
|
||||
uint32_t k = s*n_seq_tokens + i;
|
||||
cache.cells[cache.head + k].pos = batch.pos[k];
|
||||
cache.cells[cache.head + k].pos = ubatch.pos[k];
|
||||
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ const char * llama_file_version_name(llama_fver version) {
|
||||
}
|
||||
|
||||
namespace GGUFMeta {
|
||||
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
|
||||
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
|
||||
struct GKV_Base_Type {
|
||||
static constexpr gguf_type gt = gt_;
|
||||
|
||||
@@ -60,10 +60,11 @@ namespace GGUFMeta {
|
||||
public:
|
||||
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
|
||||
static ArrayInfo getter(const gguf_context *ctx, const int k) {
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
|
||||
return ArrayInfo {
|
||||
gguf_get_arr_type(ctx, k),
|
||||
arr_type,
|
||||
size_t(gguf_get_arr_n(ctx, k)),
|
||||
gguf_get_arr_data(ctx, k),
|
||||
arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -553,7 +554,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(meta.get(), i);
|
||||
|
||||
@@ -7,14 +7,12 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
// TODO: replace with ggml API call
|
||||
#define QK_K 256
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
char zero = 0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
||||
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->output_tensor_type;
|
||||
} else {
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
const int64_t nx = tensor->ne[0];
|
||||
const int64_t qk_k = ggml_blck_size(new_type);
|
||||
|
||||
if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
||||
//}
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
||||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
||||
new_type == GGML_TYPE_IQ1_M) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
|
||||
{
|
||||
const int64_t nx = tensor->ne[0];
|
||||
const int64_t ny = tensor->ne[1];
|
||||
const int64_t qk_k = ggml_blck_size(new_type);
|
||||
|
||||
if (nx % qk_k != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
|
||||
convert_incompatible_tensor = true;
|
||||
} else {
|
||||
++qs.n_k_quantized;
|
||||
}
|
||||
}
|
||||
|
||||
if (convert_incompatible_tensor) {
|
||||
switch (new_type) {
|
||||
case GGML_TYPE_TQ1_0:
|
||||
@@ -875,7 +874,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "llama-kv-cache.h"
|
||||
#include "llama-model-loader.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-quant.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
@@ -18,12 +17,8 @@
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <cfloat>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <cstdarg>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
@@ -31,10 +26,7 @@
|
||||
#include <ctime>
|
||||
#include <functional>
|
||||
#include <initializer_list>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -2540,21 +2532,36 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
struct ggml_context * ctx,
|
||||
struct llama_context & lctx,
|
||||
const llama_hparams & hparams,
|
||||
const llama_ubatch & batch,
|
||||
const llama_ubatch & ubatch,
|
||||
struct ggml_tensor * tok_embd,
|
||||
const llm_build_cb & cb) {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
if (batch.token) {
|
||||
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
|
||||
if (ubatch.token) {
|
||||
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens);
|
||||
cb(lctx.inp_tokens, "inp_tokens", -1);
|
||||
ggml_set_input(lctx.inp_tokens);
|
||||
|
||||
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
||||
|
||||
// apply lora for embedding tokens if needed
|
||||
for (auto & it : lctx.lora_adapters) {
|
||||
struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
|
||||
if (lora == nullptr) {
|
||||
continue;
|
||||
}
|
||||
const float adapter_scale = it.second;
|
||||
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
|
||||
struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
|
||||
ctx, lora->b, // non-transposed lora_b
|
||||
ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
|
||||
), scale);
|
||||
inpL = ggml_add(ctx, inpL, inpL_delta);
|
||||
}
|
||||
} else {
|
||||
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
||||
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
|
||||
inpL = lctx.inp_embd;
|
||||
ggml_set_input(lctx.inp_embd);
|
||||
}
|
||||
@@ -2625,9 +2632,8 @@ static struct ggml_tensor * llm_build_lora_mm(
|
||||
if (lora == nullptr) {
|
||||
continue;
|
||||
}
|
||||
const float alpha = it.first->alpha;
|
||||
const float rank = (float) lora->b->ne[0];
|
||||
const float scale = alpha ? it.second * alpha / rank : it.second;
|
||||
const float adapter_scale = it.second;
|
||||
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
|
||||
struct ggml_tensor * ab_cur = ggml_mul_mat(
|
||||
ctx0, lora->b,
|
||||
ggml_mul_mat(ctx0, lora->a, cur)
|
||||
@@ -3149,7 +3155,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
|
||||
static struct ggml_tensor * llm_build_mamba(
|
||||
struct ggml_context * ctx,
|
||||
struct llama_context & lctx,
|
||||
const llama_ubatch & batch,
|
||||
const llama_ubatch & ubatch,
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * state_copy,
|
||||
@@ -3165,17 +3171,17 @@ static struct ggml_tensor * llm_build_mamba(
|
||||
const int64_t d_inner = hparams.ssm_d_inner;
|
||||
const int64_t d_state = hparams.ssm_d_state;
|
||||
const int64_t dt_rank = hparams.ssm_dt_rank;
|
||||
const int64_t n_seqs = batch.n_seqs;
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
// Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
|
||||
const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
|
||||
// Use the same RMS norm as the final layer norm
|
||||
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
||||
|
||||
const int64_t n_seq_tokens = batch.n_seq_tokens;
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(ubatch.equal_seqs);
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
|
||||
struct ggml_tensor * conv_states_all = kv.k_l[il];
|
||||
struct ggml_tensor * ssm_states_all = kv.v_l[il];
|
||||
@@ -3975,6 +3981,7 @@ struct llm_build_context {
|
||||
|
||||
// feed-forward network
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
@@ -11519,13 +11526,7 @@ int32_t llama_lora_adapter_set(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter,
|
||||
float scale) {
|
||||
if (ctx->cparams.flash_attn) {
|
||||
LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
ctx->lora_adapters[adapter] = scale;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -12440,16 +12441,16 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
||||
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
||||
std::string str_split_path(split_path);
|
||||
char postfix[32];
|
||||
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
||||
std::string str_postfix(postfix);
|
||||
|
||||
// check if dest ends with postfix
|
||||
// check if split_prefix ends with postfix
|
||||
int size_prefix = str_split_path.size() - str_postfix.size();
|
||||
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
||||
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
||||
snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
||||
return size_prefix;
|
||||
}
|
||||
|
||||
@@ -12458,6 +12459,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
||||
|
||||
const char * llama_print_system_info(void) {
|
||||
static std::string s;
|
||||
s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
|
||||
|
||||
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
auto * reg = ggml_backend_reg_get(i);
|
||||
|
||||
@@ -15,66 +15,71 @@ constexpr int offset_has_tensors = 2000;
|
||||
constexpr int offset_has_data = 3000;
|
||||
|
||||
enum handcrafted_file_type {
|
||||
HANDCRAFTED_HEADER_BAD_MAGIC = 10,
|
||||
HANDCRAFTED_HEADER_BAD_VERSION_1 = 20,
|
||||
HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30,
|
||||
HANDCRAFTED_HEADER_BAD_N_TENSORS = 40,
|
||||
HANDCRAFTED_HEADER_BAD_N_KV = 50,
|
||||
HANDCRAFTED_HEADER_EMPTY = 800,
|
||||
HANDCRAFTED_HEADER_BAD_MAGIC = 10,
|
||||
HANDCRAFTED_HEADER_BAD_VERSION_1 = 20,
|
||||
HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30,
|
||||
HANDCRAFTED_HEADER_BAD_N_TENSORS = 40,
|
||||
HANDCRAFTED_HEADER_BAD_N_KV = 50,
|
||||
HANDCRAFTED_HEADER_EMPTY = 800,
|
||||
|
||||
HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv,
|
||||
HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv,
|
||||
HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv,
|
||||
HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv,
|
||||
HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv,
|
||||
HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv,
|
||||
HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv,
|
||||
// HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv, // removed because it can result in allocations > 1 TB (default sanitizer limit)
|
||||
HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv,
|
||||
HANDCRAFTED_KV_BAD_ALIGN = 50 + offset_has_kv,
|
||||
HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv,
|
||||
|
||||
HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_ALIGNMENT = 80 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_BAD_ALIGN = 75 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN = 80 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors,
|
||||
HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors,
|
||||
|
||||
HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data,
|
||||
HANDCRAFTED_DATA_BAD_ALIGNMENT = 20 + offset_has_data,
|
||||
HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data,
|
||||
HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
|
||||
HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data,
|
||||
HANDCRAFTED_DATA_BAD_ALIGN = 15 + offset_has_data,
|
||||
HANDCRAFTED_DATA_INCONSISTENT_ALIGN = 20 + offset_has_data,
|
||||
HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data,
|
||||
HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
|
||||
};
|
||||
|
||||
std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
|
||||
switch (hft) {
|
||||
case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC";
|
||||
case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
|
||||
case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE";
|
||||
case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV";
|
||||
case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS";
|
||||
case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY";
|
||||
case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC";
|
||||
case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
|
||||
case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE";
|
||||
case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV";
|
||||
case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS";
|
||||
case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY";
|
||||
|
||||
case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE";
|
||||
case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE";
|
||||
case HANDCRAFTED_KV_BAD_VALUE_SIZE: return "KV_BAD_VALUE_SIZE";
|
||||
case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY";
|
||||
case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV";
|
||||
case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE";
|
||||
case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE";
|
||||
case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY";
|
||||
case HANDCRAFTED_KV_BAD_ALIGN: return "KV_BAD_ALIGN";
|
||||
case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV";
|
||||
|
||||
case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE";
|
||||
case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS";
|
||||
case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE";
|
||||
case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG";
|
||||
case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE";
|
||||
case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET";
|
||||
case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME";
|
||||
case HANDCRAFTED_TENSORS_BAD_ALIGNMENT: return "TENSORS_BAD_ALIGNMENT";
|
||||
case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS";
|
||||
case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN";
|
||||
case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE";
|
||||
case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS";
|
||||
case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE";
|
||||
case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG";
|
||||
case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE";
|
||||
case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET";
|
||||
case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME";
|
||||
case HANDCRAFTED_TENSORS_BAD_ALIGN: return "TENSORS_BAD_ALIGN";
|
||||
case HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN: return "TENSORS_INCONSISTENT_ALIGN";
|
||||
case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS";
|
||||
case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN";
|
||||
|
||||
case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA";
|
||||
case HANDCRAFTED_DATA_BAD_ALIGNMENT: return "DATA_BAD_ALIGNMENT";
|
||||
case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS";
|
||||
case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN";
|
||||
case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA";
|
||||
case HANDCRAFTED_DATA_BAD_ALIGN: return "DATA_BAD_ALIGN";
|
||||
case HANDCRAFTED_DATA_INCONSISTENT_ALIGN: return "DATA_INCONSISTENT_ALIGN";
|
||||
case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS";
|
||||
case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN";
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -140,31 +145,41 @@ std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937
|
||||
return kv_types;
|
||||
}
|
||||
|
||||
static void helper_write(const void * data, const size_t nbytes, FILE * file) {
|
||||
template <typename T>
|
||||
static void helper_write(FILE * file, const T & val) {
|
||||
GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val));
|
||||
}
|
||||
|
||||
static void helper_write(FILE * file, const void * data, const size_t nbytes) {
|
||||
GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
|
||||
}
|
||||
|
||||
static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
|
||||
FILE * file = tmpfile();
|
||||
|
||||
if (!file) {
|
||||
return file;
|
||||
}
|
||||
|
||||
std::mt19937 rng(seed);
|
||||
uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
|
||||
|
||||
if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) {
|
||||
const char bad_magic[4] = {'F', 'U', 'G', 'G'};
|
||||
helper_write(bad_magic, sizeof(bad_magic), file);
|
||||
helper_write(file, bad_magic, sizeof(bad_magic));
|
||||
} else {
|
||||
helper_write(GGUF_MAGIC, 4, file);
|
||||
helper_write(file, GGUF_MAGIC, 4);
|
||||
}
|
||||
|
||||
if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
|
||||
const uint32_t version = 1;
|
||||
helper_write(&version, sizeof(version), file);
|
||||
helper_write(file, version);
|
||||
} else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
|
||||
const uint32_t version = GGUF_VERSION + 1;
|
||||
helper_write(&version, sizeof(version), file);
|
||||
helper_write(file, version);
|
||||
} else {
|
||||
const uint32_t version = GGUF_VERSION;
|
||||
helper_write(&version, sizeof(version), file);
|
||||
helper_write(file, version);
|
||||
}
|
||||
|
||||
std::vector<tensor_config_t> tensor_configs;
|
||||
@@ -174,10 +189,10 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
|
||||
if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) {
|
||||
const uint64_t n_tensors = -1;
|
||||
helper_write(&n_tensors, sizeof(n_tensors), file);
|
||||
helper_write(file, n_tensors);
|
||||
} else {
|
||||
const uint64_t n_tensors = tensor_configs.size();
|
||||
helper_write(&n_tensors, sizeof(n_tensors), file);
|
||||
helper_write(file, n_tensors);
|
||||
}
|
||||
|
||||
std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
|
||||
@@ -186,41 +201,49 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
}
|
||||
{
|
||||
uint64_t n_kv = kv_types.size();
|
||||
if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
||||
if (hft == HANDCRAFTED_KV_BAD_ALIGN ||
|
||||
hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
|
||||
hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
||||
|
||||
n_kv += 1;
|
||||
} else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
|
||||
n_kv = -1;
|
||||
}
|
||||
helper_write(&n_kv, sizeof(n_kv), file);
|
||||
helper_write(file, n_kv);
|
||||
}
|
||||
|
||||
if (hft < offset_has_kv) {
|
||||
while (ftell(file) % alignment != 0) {
|
||||
const char pad = 0;
|
||||
helper_write(file, pad);
|
||||
}
|
||||
|
||||
for (int i = 0; i < extra_bytes; ++i) {
|
||||
const char tmp = 0;
|
||||
helper_write(&tmp, sizeof(tmp), file);
|
||||
helper_write(file, tmp);
|
||||
}
|
||||
rewind(file);
|
||||
return file;
|
||||
}
|
||||
|
||||
for (int i = 0; i < int(kv_types.size()); ++i) {
|
||||
const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].first);
|
||||
const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].second);
|
||||
const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].first);
|
||||
const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].second);
|
||||
|
||||
const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i));
|
||||
|
||||
if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
|
||||
const uint64_t n = -1;
|
||||
helper_write(&n, sizeof(n), file);
|
||||
helper_write(file, n);
|
||||
} else {
|
||||
const uint64_t n = key.length();
|
||||
helper_write(&n, sizeof(n), file);
|
||||
helper_write(file, n);
|
||||
}
|
||||
helper_write(key.data(), key.length(), file);
|
||||
helper_write(file, key.data(), key.length());
|
||||
|
||||
{
|
||||
const int32_t type32 = int32_t(type);
|
||||
helper_write(&type32, sizeof(type32), file);
|
||||
helper_write(file, type32);
|
||||
}
|
||||
|
||||
uint32_t data[16];
|
||||
@@ -233,69 +256,67 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
|
||||
if (type == GGUF_TYPE_STRING) {
|
||||
const uint64_t n = rng() % sizeof(data);
|
||||
helper_write(&n, sizeof(n), file);
|
||||
helper_write(data, n, file);
|
||||
helper_write(file, n);
|
||||
helper_write(file, data, n);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (type == GGUF_TYPE_ARRAY) {
|
||||
{
|
||||
const int32_t type32 = int32_t(type_arr);
|
||||
helper_write(&type32, sizeof(type32), file);
|
||||
helper_write(file, type32);
|
||||
}
|
||||
if (type_arr == GGUF_TYPE_STRING) {
|
||||
const uint64_t nstr = rng() % (16 + 1);
|
||||
helper_write(&nstr, sizeof(nstr), file);
|
||||
helper_write(file, nstr);
|
||||
for (uint64_t istr = 0; istr < nstr; ++istr) {
|
||||
const uint64_t n = rng() % (sizeof(uint32_t) + 1);
|
||||
helper_write(&n, sizeof(n), file);
|
||||
helper_write(&data[istr], n, file);
|
||||
helper_write(file, n);
|
||||
helper_write(file, &data[istr], n);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const size_t type_size = gguf_type_size(type_arr);
|
||||
const uint64_t n = (rng() % sizeof(data)) / type_size;
|
||||
helper_write(&n, sizeof(n), file);
|
||||
helper_write(&data, n*type_size, file);
|
||||
helper_write(file, n);
|
||||
helper_write(file, &data, n*type_size);
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t type_size = hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type);
|
||||
if (hft == HANDCRAFTED_KV_BAD_VALUE_SIZE) {
|
||||
type_size += rng() % 3;
|
||||
}
|
||||
helper_write(data, type_size, file);
|
||||
helper_write(file, data, hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type));
|
||||
}
|
||||
|
||||
if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
||||
const std::string key = "general.alignment";
|
||||
{
|
||||
const uint64_t n = key.length();
|
||||
helper_write(&n, sizeof(n), file);
|
||||
}
|
||||
helper_write(key.data(), key.length(), file);
|
||||
if (hft == HANDCRAFTED_KV_BAD_ALIGN ||
|
||||
hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
|
||||
hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
||||
|
||||
const uint64_t n = strlen(GGUF_KEY_GENERAL_ALIGNMENT);
|
||||
helper_write(file, n);
|
||||
helper_write(file, GGUF_KEY_GENERAL_ALIGNMENT, n);
|
||||
|
||||
const int32_t type = gguf_type(GGUF_TYPE_UINT32);
|
||||
helper_write(&type, sizeof(type), file);
|
||||
helper_write(file, type);
|
||||
|
||||
const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT + 1;
|
||||
helper_write(&alignment, sizeof(alignment), file);
|
||||
alignment = expect_context_not_null(hft) ? 1 : 13;
|
||||
helper_write(file, alignment);
|
||||
}
|
||||
|
||||
if (hft < offset_has_tensors) {
|
||||
while (ftell(file) % alignment != 0) {
|
||||
const char pad = 0;
|
||||
helper_write(file, pad);
|
||||
}
|
||||
|
||||
for (int i = 0; i < extra_bytes; ++i) {
|
||||
const char tmp = 0;
|
||||
helper_write(&tmp, sizeof(tmp), file);
|
||||
helper_write(file, tmp);
|
||||
}
|
||||
rewind(file);
|
||||
return file;
|
||||
}
|
||||
|
||||
uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
|
||||
if (hft == HANDCRAFTED_TENSORS_BAD_ALIGNMENT || hft == HANDCRAFTED_DATA_BAD_ALIGNMENT) {
|
||||
alignment -= 1;
|
||||
} else if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
|
||||
alignment += 1;
|
||||
if (hft == HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN || hft == HANDCRAFTED_DATA_INCONSISTENT_ALIGN) {
|
||||
alignment = 1;
|
||||
}
|
||||
|
||||
uint64_t offset = 0;
|
||||
@@ -313,9 +334,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
}
|
||||
{
|
||||
const uint64_t n = name.length();
|
||||
helper_write(&n, sizeof(n), file);
|
||||
helper_write(file, n);
|
||||
}
|
||||
helper_write(name.data(), name.length(), file);
|
||||
helper_write(file, name.data(), name.length());
|
||||
|
||||
uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1;
|
||||
for (int i = GGML_MAX_DIMS-1; i >= 1; --i) {
|
||||
@@ -326,35 +347,35 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
}
|
||||
if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) {
|
||||
const uint32_t n_dims_bad = GGML_MAX_DIMS + 1;
|
||||
helper_write(&n_dims_bad, sizeof(n_dims_bad), file);
|
||||
helper_write(file, n_dims_bad);
|
||||
} else {
|
||||
helper_write(&n_dims, sizeof(n_dims), file);
|
||||
helper_write(file, n_dims);
|
||||
}
|
||||
|
||||
if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) {
|
||||
for (uint32_t j = 0; j < n_dims; ++j) {
|
||||
const int64_t bad_dim = -1;
|
||||
helper_write(&bad_dim, sizeof(bad_dim), file);
|
||||
helper_write(file, bad_dim);
|
||||
}
|
||||
} else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){
|
||||
for (uint32_t j = 0; j < n_dims; ++j) {
|
||||
const int64_t big_dim = 4*int64_t(INT32_MAX);
|
||||
helper_write(&big_dim, sizeof(big_dim), file);
|
||||
helper_write(file, big_dim);
|
||||
}
|
||||
} else {
|
||||
helper_write(shape.data(), n_dims*sizeof(int64_t), file);
|
||||
helper_write(file, shape.data(), n_dims*sizeof(int64_t));
|
||||
}
|
||||
|
||||
{
|
||||
const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? -1 : int32_t(type);
|
||||
helper_write(&type32, sizeof(type32), file);
|
||||
const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? GGML_TYPE_COUNT : int32_t(type);
|
||||
helper_write(file, type32);
|
||||
}
|
||||
|
||||
if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) {
|
||||
const uint64_t bad_offset = -1;
|
||||
helper_write(&bad_offset, sizeof(bad_offset), file);
|
||||
helper_write(file, bad_offset);
|
||||
} else {
|
||||
helper_write(&offset, sizeof(offset), file);
|
||||
helper_write(file, offset);
|
||||
}
|
||||
|
||||
int64_t ne = shape[0];
|
||||
@@ -364,12 +385,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
offset += GGML_PAD(ggml_row_size(type, ne), alignment);
|
||||
}
|
||||
|
||||
const uint32_t alignment_overshoot = ftell(file) % alignment;
|
||||
if (alignment_overshoot != 0) {
|
||||
for (size_t i = alignment_overshoot; i < alignment; ++i) {
|
||||
const char pad = 0;
|
||||
helper_write(&pad, sizeof(pad), file);
|
||||
}
|
||||
while (ftell(file) % alignment != 0) {
|
||||
const char pad = 0;
|
||||
helper_write(file, pad);
|
||||
}
|
||||
|
||||
if (hft >= offset_has_data) {
|
||||
@@ -380,13 +398,13 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
|
||||
}
|
||||
for (uint64_t i = 0; i < nbytes; ++i) {
|
||||
const uint8_t random_byte = i % 256;
|
||||
helper_write(&random_byte, sizeof(random_byte), file);
|
||||
helper_write(file, random_byte);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < extra_bytes; ++i) {
|
||||
const char tmp = 0;
|
||||
helper_write(&tmp, sizeof(tmp), file);
|
||||
helper_write(file, tmp);
|
||||
}
|
||||
rewind(file);
|
||||
return file;
|
||||
@@ -505,6 +523,16 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i
|
||||
}
|
||||
|
||||
const char * data_gguf = reinterpret_cast<const char *>(gguf_get_arr_data(gguf_ctx, id));
|
||||
|
||||
if (type_arr == GGUF_TYPE_BOOL) {
|
||||
for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
|
||||
if (bool(data8[arr_i]) != bool(data_gguf[arr_i])) {
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) {
|
||||
ok = false;
|
||||
}
|
||||
@@ -512,12 +540,20 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i
|
||||
}
|
||||
|
||||
const char * data_gguf = reinterpret_cast<const char *>(gguf_get_val_data(gguf_ctx, id));
|
||||
|
||||
if (type == GGUF_TYPE_BOOL) {
|
||||
if (bool(*data8) != bool(*data_gguf)) {
|
||||
ok = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) {
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t expected_alignment = alignment_defined ? GGUF_DEFAULT_ALIGNMENT + 1 : GGUF_DEFAULT_ALIGNMENT;
|
||||
const uint32_t expected_alignment = alignment_defined ? 1 : GGUF_DEFAULT_ALIGNMENT;
|
||||
if (gguf_get_alignment(gguf_ctx) != expected_alignment) {
|
||||
ok = false;
|
||||
}
|
||||
@@ -539,7 +575,7 @@ static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsig
|
||||
|
||||
bool ok = true;
|
||||
|
||||
const int id_alignment = gguf_find_key(gguf_ctx, "general.alignment");
|
||||
const int id_alignment = gguf_find_key(gguf_ctx, GGUF_KEY_GENERAL_ALIGNMENT);
|
||||
const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT;
|
||||
|
||||
uint64_t expected_offset = 0;
|
||||
@@ -607,7 +643,7 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u
|
||||
|
||||
std::vector<uint8_t> data(size);
|
||||
GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0);
|
||||
GGML_ASSERT(fread(data.data(), 1, size, file) == size);
|
||||
GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
|
||||
|
||||
for (size_t j = 0; j < size; ++j) {
|
||||
const uint8_t expected_byte = (j + offset) % 256;
|
||||
@@ -627,15 +663,15 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
const std::vector<handcrafted_file_type> hfts = {
|
||||
HANDCRAFTED_HEADER_BAD_MAGIC,
|
||||
HANDCRAFTED_HEADER_BAD_VERSION_1,
|
||||
// HANDCRAFTED_FILE_TYPE_BAD_VERSION_FUTURE, // FIXME
|
||||
HANDCRAFTED_HEADER_BAD_VERSION_FUTURE,
|
||||
HANDCRAFTED_HEADER_BAD_N_KV,
|
||||
HANDCRAFTED_HEADER_BAD_N_TENSORS,
|
||||
HANDCRAFTED_HEADER_EMPTY,
|
||||
|
||||
HANDCRAFTED_KV_BAD_KEY_SIZE,
|
||||
HANDCRAFTED_KV_BAD_TYPE,
|
||||
// HANDCRAFTED_KV_BAD_VALUE_SIZE, // FIXME sanitizer limit
|
||||
// HANDCRAFTED_FILE_TYPE_DUPLICATE_KEY, // FIXME
|
||||
HANDCRAFTED_KV_DUPLICATE_KEY,
|
||||
HANDCRAFTED_KV_BAD_ALIGN,
|
||||
HANDCRAFTED_KV_SUCCESS,
|
||||
|
||||
HANDCRAFTED_TENSORS_BAD_NAME_SIZE,
|
||||
@@ -643,14 +679,16 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
HANDCRAFTED_TENSORS_BAD_SHAPE,
|
||||
HANDCRAFTED_TENSORS_NE_TOO_BIG,
|
||||
HANDCRAFTED_TENSORS_BAD_TYPE,
|
||||
// HANDCRAFTED_TENSORS_BAD_OFFSET, // FIXME
|
||||
HANDCRAFTED_TENSORS_BAD_OFFSET,
|
||||
HANDCRAFTED_TENSORS_DUPLICATE_NAME,
|
||||
// HANDCRAFTED_TENSORS_BAD_ALIGNMENT, // FIXME
|
||||
HANDCRAFTED_TENSORS_BAD_ALIGN,
|
||||
HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN,
|
||||
HANDCRAFTED_TENSORS_SUCCESS,
|
||||
HANDCRAFTED_TENSORS_CUSTOM_ALIGN,
|
||||
|
||||
HANDCRAFTED_DATA_NOT_ENOUGH_DATA,
|
||||
// HANDCRAFTED_DATA_BAD_ALIGNMENT, // FIXME
|
||||
HANDCRAFTED_DATA_BAD_ALIGN,
|
||||
HANDCRAFTED_DATA_INCONSISTENT_ALIGN,
|
||||
HANDCRAFTED_DATA_SUCCESS,
|
||||
HANDCRAFTED_DATA_CUSTOM_ALIGN,
|
||||
};
|
||||
@@ -674,6 +712,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
/*no_alloc =*/ false,
|
||||
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
|
||||
};
|
||||
|
||||
struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
|
||||
|
||||
if (expect_context_not_null(hft)) {
|
||||
@@ -689,7 +728,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
}
|
||||
ntest++;
|
||||
|
||||
if (false && hft >= offset_has_data && !expect_context_not_null(hft)) { // FIXME
|
||||
if (hft >= offset_has_data && !expect_context_not_null(hft)) {
|
||||
printf("%s: - no_dangling_ggml_context_pointer: ", __func__);
|
||||
if (ctx) {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
@@ -700,23 +739,6 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
ntest++;
|
||||
}
|
||||
|
||||
if (false && expect_context_not_null(hft)) { // FIXME
|
||||
FILE * file_eb = get_handcrafted_file(seed, hft, /*extra_bytes =*/ 1);
|
||||
struct gguf_context * gguf_ctx_eb = gguf_init_from_file_impl(file_eb, gguf_params);
|
||||
|
||||
printf("%s: - context_null_with_extra_bytes: ", __func__);
|
||||
if (gguf_ctx_eb) {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
} else {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
npass++;
|
||||
}
|
||||
ntest++;
|
||||
|
||||
gguf_free(gguf_ctx_eb);
|
||||
fclose(file_eb);
|
||||
}
|
||||
|
||||
const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN;
|
||||
|
||||
if (expect_context_not_null(hft)) {
|
||||
@@ -763,14 +785,15 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
ntest++;
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
if (gguf_ctx) {
|
||||
ggml_free(ctx);
|
||||
gguf_free(gguf_ctx);
|
||||
}
|
||||
fclose(file);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
@@ -789,10 +812,6 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
|
||||
const std::string key = "my_key_" + std::to_string(rng() % 1024);
|
||||
const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
|
||||
|
||||
if (type == GGUF_TYPE_STRING || type == GGUF_TYPE_ARRAY) {
|
||||
continue; // FIXME memory leak
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: gguf_set_val_u8 (gguf_ctx, key.c_str(), rng() % (1 << 7)); break;
|
||||
case GGUF_TYPE_INT8: gguf_set_val_i8 (gguf_ctx, key.c_str(), rng() % (1 << 7) - (1 << 6)); break;
|
||||
@@ -826,6 +845,9 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
|
||||
std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
|
||||
for (size_t j = 0; j < random_data.size(); ++j) {
|
||||
random_data[j] = rng();
|
||||
if (type_arr == GGUF_TYPE_BOOL) {
|
||||
random_data[j] &= 0x01010101; // the sanitizer complains if booleans are not 0 or 1
|
||||
}
|
||||
}
|
||||
gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne);
|
||||
} break;
|
||||
@@ -928,6 +950,17 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
|
||||
continue;
|
||||
}
|
||||
|
||||
if (type_arr == GGUF_TYPE_BOOL) {
|
||||
const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id));
|
||||
const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
|
||||
for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
|
||||
if (bool(data[arr_i]) != bool(data_other[arr_i])) {
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (type_arr == GGUF_TYPE_STRING) {
|
||||
for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
|
||||
const std::string str = gguf_get_arr_str(ctx, id, arr_i);
|
||||
@@ -939,8 +972,8 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
|
||||
continue;
|
||||
}
|
||||
|
||||
const char * data = reinterpret_cast<const char *>(gguf_get_arr_data(ctx, id));
|
||||
const char * data_other = reinterpret_cast<const char *>(gguf_get_arr_data(other, idx_other));
|
||||
const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id));
|
||||
const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
|
||||
if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) {
|
||||
ok = false;
|
||||
}
|
||||
@@ -1028,21 +1061,6 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
|
||||
FILE * file = tmpfile();
|
||||
#ifdef _WIN32
|
||||
if (!file) {
|
||||
printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
|
||||
printf("%s: skipping tests");
|
||||
return std::make_pair(0, 0);
|
||||
}
|
||||
#else
|
||||
GGML_ASSERT(file);
|
||||
#endif // _WIN32
|
||||
|
||||
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
return std::make_pair(0, 0); // FIXME
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
|
||||
printf("%s: device=%s, backend=%s, only_meta=%s\n",
|
||||
__func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
|
||||
@@ -1060,10 +1078,24 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
|
||||
bbuf = result.buffer;
|
||||
}
|
||||
|
||||
struct gguf_buf gbuf = gguf_buf_init(16 * 1024);
|
||||
gguf_write_to_buf(gguf_ctx_0, &gbuf, only_meta);
|
||||
helper_write(gbuf.data, gbuf.offset, file);
|
||||
rewind(file);
|
||||
FILE * file = tmpfile();
|
||||
|
||||
#ifdef _WIN32
|
||||
if (!file) {
|
||||
printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
|
||||
printf("%s: skipping tests");
|
||||
return std::make_pair(0, 0);
|
||||
}
|
||||
#else
|
||||
GGML_ASSERT(file);
|
||||
#endif // _WIN32
|
||||
|
||||
{
|
||||
std::vector<int8_t> buf;
|
||||
gguf_write_to_buf(gguf_ctx_0, buf, only_meta);
|
||||
GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size());
|
||||
rewind(file);
|
||||
}
|
||||
|
||||
struct ggml_context * ctx_1 = nullptr;
|
||||
struct gguf_init_params gguf_params = {
|
||||
@@ -1151,9 +1183,8 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
|
||||
ggml_free(ctx_1);
|
||||
gguf_free(gguf_ctx_0);
|
||||
gguf_free(gguf_ctx_1);
|
||||
gguf_buf_free(gbuf);
|
||||
ggml_backend_free(backend);
|
||||
GGML_ASSERT(fclose(file) == 0);
|
||||
fclose(file);
|
||||
|
||||
printf("\n");
|
||||
return std::make_pair(npass, ntest);
|
||||
|
||||
Reference in New Issue
Block a user