mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-02 15:14:06 +00:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5ca0944a15 | ||
|
|
adc9ff3841 | ||
|
|
987d743d6b | ||
|
|
b226c1227b | ||
|
|
3b38d48609 | ||
|
|
6d1616944d | ||
|
|
bde7cd3cd9 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -34,9 +34,11 @@ ggml-metal-embed.metal
|
||||
lcov-report/
|
||||
gcovr-report/
|
||||
|
||||
tags
|
||||
build*
|
||||
!build.zig
|
||||
cmake-build-*
|
||||
android-ndk-*
|
||||
out/
|
||||
tmp/
|
||||
|
||||
|
||||
@@ -557,12 +557,17 @@ if (LLAMA_VULKAN)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
if ($ENV{ROCM_PATH})
|
||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||
if (NOT EXISTS $ENV{ROCM_PATH})
|
||||
if (NOT EXISTS /opt/rocm)
|
||||
set(ROCM_PATH /usr)
|
||||
else()
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
endif()
|
||||
else()
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||
endif()
|
||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
||||
|
||||
# CMake on Windows doesn't support the HIP language yet
|
||||
if(WIN32)
|
||||
|
||||
29
Makefile
29
Makefile
@@ -69,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef LLAMA_RPC
|
||||
BUILD_TARGETS += rpc-server
|
||||
endif
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
@@ -429,6 +433,11 @@ ifdef LLAMA_BLIS
|
||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
ifdef LLAMA_RPC
|
||||
MK_CPPFLAGS += -DGGML_USE_RPC
|
||||
OBJS += ggml-rpc.o
|
||||
endif # LLAMA_RPC
|
||||
|
||||
ifdef LLAMA_CUBLAS
|
||||
# LLAMA_CUBLAS is deprecated and will be removed in the future
|
||||
LLAMA_CUDA := 1
|
||||
@@ -654,11 +663,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
||||
endif
|
||||
endif # LLAMA_METAL
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
||||
|
||||
ifndef LLAMA_NO_LLAMAFILE
|
||||
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif
|
||||
|
||||
ifdef LLAMA_RPC
|
||||
ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif # LLAMA_RPC
|
||||
|
||||
GF_CC := $(CC)
|
||||
include scripts/get-flags.mk
|
||||
|
||||
@@ -738,14 +762,9 @@ unicode.o: unicode.cpp unicode.h
|
||||
unicode-data.o: unicode-data.cpp unicode-data.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
||||
|
||||
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
||||
|
||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
|
||||
11
README.md
11
README.md
@@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options.
|
||||
cmake --build build --config Debug
|
||||
```
|
||||
|
||||
- Using `Zig` (version 0.11 or later):
|
||||
|
||||
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
|
||||
it's also possible to cross compile for other operating systems and architectures:
|
||||
|
||||
```bash
|
||||
zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
|
||||
```
|
||||
|
||||
The `zig targets` command will give you valid options to use.
|
||||
|
||||
- Using `gmake` (FreeBSD):
|
||||
|
||||
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
|
||||
|
||||
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
|
||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
|
||||
@@ -140,10 +140,11 @@ static std::string get_gpu_info() {
|
||||
}
|
||||
|
||||
// command line params
|
||||
enum output_formats {CSV, JSON, MARKDOWN, SQL};
|
||||
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
|
||||
|
||||
static const char * output_format_str(output_formats format) {
|
||||
switch (format) {
|
||||
case NONE: return "none";
|
||||
case CSV: return "csv";
|
||||
case JSON: return "json";
|
||||
case MARKDOWN: return "md";
|
||||
@@ -152,6 +153,23 @@ static const char * output_format_str(output_formats format) {
|
||||
}
|
||||
}
|
||||
|
||||
static bool output_format_from_str(const std::string & s, output_formats & format) {
|
||||
if (s == "none") {
|
||||
format = NONE;
|
||||
} else if (s == "csv") {
|
||||
format = CSV;
|
||||
} else if (s == "json") {
|
||||
format = JSON;
|
||||
} else if (s == "md") {
|
||||
format = MARKDOWN;
|
||||
} else if (s == "sql") {
|
||||
format = SQL;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * split_mode_str(llama_split_mode mode) {
|
||||
switch (mode) {
|
||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
||||
@@ -190,31 +208,33 @@ struct cmd_params {
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
output_formats output_format_stderr;
|
||||
};
|
||||
|
||||
static const cmd_params cmd_params_defaults = {
|
||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||
/* n_prompt */ {512},
|
||||
/* n_gen */ {128},
|
||||
/* n_pg */ {},
|
||||
/* n_batch */ {2048},
|
||||
/* n_ubatch */ {512},
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* flash_attn */ {false},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||
/* n_prompt */ {512},
|
||||
/* n_gen */ {128},
|
||||
/* n_pg */ {},
|
||||
/* n_batch */ {2048},
|
||||
/* n_ubatch */ {512},
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* flash_attn */ {false},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN,
|
||||
/* output_format_stderr */ NONE,
|
||||
};
|
||||
|
||||
static void print_usage(int /* argc */, char ** argv) {
|
||||
@@ -243,6 +263,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf("\n");
|
||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||
@@ -284,6 +305,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
|
||||
params.verbose = cmd_params_defaults.verbose;
|
||||
params.output_format = cmd_params_defaults.output_format;
|
||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||
params.reps = cmd_params_defaults.reps;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
@@ -493,18 +515,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (argv[i] == std::string("csv")) {
|
||||
params.output_format = CSV;
|
||||
} else if (argv[i] == std::string("json")) {
|
||||
params.output_format = JSON;
|
||||
} else if (argv[i] == std::string("md")) {
|
||||
params.output_format = MARKDOWN;
|
||||
} else if (argv[i] == std::string("sql")) {
|
||||
params.output_format = SQL;
|
||||
} else {
|
||||
invalid_param = !output_format_from_str(argv[i], params.output_format);
|
||||
} else if (arg == "-oe" || arg == "--output-err") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else {
|
||||
@@ -1278,6 +1295,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||
switch (format) {
|
||||
case NONE:
|
||||
return nullptr;
|
||||
case CSV:
|
||||
return std::unique_ptr<printer>(new csv_printer());
|
||||
case JSON:
|
||||
return std::unique_ptr<printer>(new json_printer());
|
||||
case MARKDOWN:
|
||||
return std::unique_ptr<printer>(new markdown_printer());
|
||||
case SQL:
|
||||
return std::unique_ptr<printer>(new sql_printer());
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// try to set locale for unicode characters in markdown
|
||||
setlocale(LC_CTYPE, ".UTF-8");
|
||||
@@ -1304,26 +1337,18 @@ int main(int argc, char ** argv) {
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// initialize printer
|
||||
std::unique_ptr<printer> p;
|
||||
switch (params.output_format) {
|
||||
case CSV:
|
||||
p.reset(new csv_printer());
|
||||
break;
|
||||
case JSON:
|
||||
p.reset(new json_printer());
|
||||
break;
|
||||
case MARKDOWN:
|
||||
p.reset(new markdown_printer());
|
||||
break;
|
||||
case SQL:
|
||||
p.reset(new sql_printer());
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
exit(1);
|
||||
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||
|
||||
if (p) {
|
||||
p->fout = stdout;
|
||||
p->print_header(params);
|
||||
}
|
||||
|
||||
if (p_err) {
|
||||
p_err->fout = stderr;
|
||||
p_err->print_header(params);
|
||||
}
|
||||
p->fout = stdout;
|
||||
p->print_header(params);
|
||||
|
||||
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
|
||||
|
||||
@@ -1381,7 +1406,15 @@ int main(int argc, char ** argv) {
|
||||
t.samples_ns.push_back(t_ns);
|
||||
}
|
||||
|
||||
p->print_test(t);
|
||||
if (p) {
|
||||
p->print_test(t);
|
||||
fflush(p->fout);
|
||||
}
|
||||
|
||||
if (p_err) {
|
||||
p_err->print_test(t);
|
||||
fflush(p_err->fout);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
@@ -1390,7 +1423,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
||||
p->print_footer();
|
||||
if (p) {
|
||||
p->print_footer();
|
||||
}
|
||||
|
||||
if (p_err) {
|
||||
p_err->print_footer();
|
||||
}
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||
// this tensor was allocated without ggml-backend
|
||||
return;
|
||||
}
|
||||
ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
|
||||
ggml_backend_view_init(tensor);
|
||||
}
|
||||
} else {
|
||||
if (tensor->data == NULL) {
|
||||
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||
if (t->view_src == NULL) {
|
||||
ggml_tallocr_alloc(&tallocr, t);
|
||||
} else if (t->buffer == NULL) {
|
||||
ggml_backend_view_init(buffer, t);
|
||||
ggml_backend_view_init(t);
|
||||
}
|
||||
} else {
|
||||
if (t->view_src != NULL && t->buffer == NULL) {
|
||||
// view of a pre-allocated tensor
|
||||
ggml_backend_view_init(buffer, t);
|
||||
ggml_backend_view_init(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
if (dst_buf->iface.cpy_tensor) {
|
||||
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
|
||||
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
||||
|
||||
// utils
|
||||
|
||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->buffer == NULL);
|
||||
GGML_ASSERT(tensor->view_src != NULL);
|
||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||
|
||||
tensor->buffer = buffer;
|
||||
tensor->buffer = tensor->view_src->buffer;
|
||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
||||
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
||||
struct ggml_tensor * dst = node_copies[id];
|
||||
if (dst->view_src != NULL) {
|
||||
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
||||
ggml_backend_view_init(dst->view_src->buffer, dst);
|
||||
ggml_backend_view_init(dst);
|
||||
}
|
||||
else {
|
||||
ggml_backend_tensor_copy(src, dst);
|
||||
|
||||
@@ -225,7 +225,7 @@ extern "C" {
|
||||
|
||||
// Tensor initialization
|
||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
||||
if (remote_ptr != 0) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||
ggml_backend_rpc_buffer_interface,
|
||||
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
||||
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
||||
remote_size);
|
||||
return buffer;
|
||||
} else {
|
||||
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
|
||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .name = */ "RPC",
|
||||
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||
};
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
|
||||
5
ggml.c
5
ggml.c
@@ -2272,6 +2272,11 @@ inline static float ggml_silu_f32(float x) {
|
||||
return x/(1.0f + expf(-x));
|
||||
}
|
||||
|
||||
#if __FINITE_MATH_ONLY__
|
||||
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
||||
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__aarch64__)
|
||||
|
||||
// adapted from arm limited optimized routine
|
||||
|
||||
237
llama.cpp
237
llama.cpp
@@ -2149,12 +2149,12 @@ struct llama_control_vector {
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
using ttype = llama_token_type;
|
||||
using tattr = llama_token_attr;
|
||||
|
||||
struct token_data {
|
||||
token text;
|
||||
float score;
|
||||
ttype type;
|
||||
tattr attr;
|
||||
};
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
@@ -2371,13 +2371,34 @@ struct llama_context {
|
||||
struct llama_control_vector cvec;
|
||||
};
|
||||
|
||||
static size_t llama_get_device_count(const llama_model & model) {
|
||||
size_t count = 1;
|
||||
#if defined(GGML_USE_CUDA)
|
||||
count = ggml_backend_cuda_get_device_count();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
count = ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
count = ggml_backend_vk_get_device_count();
|
||||
#endif
|
||||
#if defined(GGML_USE_RPC)
|
||||
count += model.rpc_servers.size();
|
||||
#endif
|
||||
return count;
|
||||
GGML_UNUSED(model);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
std::string endpoint = model.rpc_servers[gpu];
|
||||
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
||||
#elif defined(GGML_USE_METAL)
|
||||
#if defined(GGML_USE_RPC)
|
||||
int dev_count = (int)llama_get_device_count(model);
|
||||
int rpc_count = (int)model.rpc_servers.size();
|
||||
if (gpu >= dev_count - rpc_count) {
|
||||
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
||||
return ggml_backend_rpc_buffer_type(endpoint);
|
||||
}
|
||||
#endif
|
||||
#if defined(GGML_USE_METAL)
|
||||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
@@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
||||
GGML_UNUSED(tensor_split);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_count(const llama_model & model) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
return model.rpc_servers.size();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
return ggml_backend_cuda_get_device_count();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
return ggml_backend_vk_get_device_count();
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
GGML_UNUSED(model);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
size_t total;
|
||||
size_t free;
|
||||
std::string endpoint = model.rpc_servers[device];
|
||||
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
int dev_count = (int)llama_get_device_count(model);
|
||||
int rpc_count = (int)model.rpc_servers.size();
|
||||
if (device >= dev_count - rpc_count) {
|
||||
size_t total;
|
||||
size_t free;
|
||||
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
||||
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
||||
return free;
|
||||
}
|
||||
#endif
|
||||
#if defined(GGML_USE_CUDA)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||
@@ -4739,7 +4750,20 @@ static void llm_load_vocab(
|
||||
auto & token_data = vocab.id_to_token[i];
|
||||
token_data.text = std::move(word);
|
||||
token_data.score = scores ? scores[i] : 0.0f;
|
||||
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
||||
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
||||
|
||||
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
||||
switch(toktypes[i]) {
|
||||
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
||||
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
||||
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
||||
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
||||
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
||||
|
||||
@@ -4830,7 +4854,7 @@ static void llm_load_vocab(
|
||||
// build special tokens cache
|
||||
{
|
||||
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
||||
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
||||
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
||||
vocab.cache_special_tokens.push_back(id);
|
||||
}
|
||||
}
|
||||
@@ -4860,6 +4884,59 @@ static void llm_load_vocab(
|
||||
|
||||
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// Handle per token attributes
|
||||
//NOTE: Each model customizes per token attributes.
|
||||
//NOTE: Per token attributes are missing from the GGUF file.
|
||||
//TODO: Extract attributes from GGUF file.
|
||||
{
|
||||
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
||||
for (auto substr : substrs) {
|
||||
if (str.find(substr) < std::string::npos) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
||||
uint32_t current = vocab.id_to_token.at(id).attr;
|
||||
current = value ? (current | attr) : (current & ~attr);
|
||||
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
||||
};
|
||||
|
||||
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
||||
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
||||
};
|
||||
|
||||
std::string model_name;
|
||||
std::string tokenizer_pre;
|
||||
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||||
|
||||
// model name to lowercase
|
||||
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
||||
[] (const std::string::value_type x) {
|
||||
return std::tolower(x);
|
||||
}
|
||||
);
|
||||
|
||||
// set attributes by model/tokenizer name
|
||||
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
||||
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
||||
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
||||
for (auto id : vocab.cache_special_tokens) {
|
||||
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||||
}
|
||||
for (auto token : {"</s>"}) {
|
||||
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||||
}
|
||||
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
||||
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
@@ -12609,27 +12686,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
||||
|
||||
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
||||
}
|
||||
|
||||
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
||||
}
|
||||
|
||||
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
||||
}
|
||||
|
||||
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
||||
}
|
||||
|
||||
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||
}
|
||||
|
||||
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||
@@ -13247,7 +13324,8 @@ struct fragment_buffer_variant {
|
||||
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
||||
// for each special token
|
||||
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
||||
const auto & special_token = vocab.id_to_token[special_id].text;
|
||||
const auto & data = vocab.id_to_token[special_id];
|
||||
const auto & special_token = data.text;
|
||||
|
||||
// for each text fragment
|
||||
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
||||
@@ -13284,13 +13362,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||
if (match > raw_text_base_offset) {
|
||||
// left
|
||||
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
||||
const int64_t left_reminder_length = match - raw_text_base_offset;
|
||||
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
||||
int64_t left_reminder_length = match - raw_text_base_offset;
|
||||
|
||||
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
||||
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
||||
left_reminder_length--;
|
||||
}
|
||||
}
|
||||
|
||||
if (left_reminder_length > 0) {
|
||||
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
||||
it++;
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
||||
#endif
|
||||
it++;
|
||||
}
|
||||
|
||||
// special token
|
||||
@@ -13299,16 +13386,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||
|
||||
// right
|
||||
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
||||
const int64_t right_reminder_offset = match + special_token.length();
|
||||
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
||||
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
||||
int64_t right_reminder_offset = match + special_token.length();
|
||||
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
||||
|
||||
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
||||
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
||||
right_reminder_offset++;
|
||||
right_reminder_length--;
|
||||
}
|
||||
}
|
||||
|
||||
if (right_reminder_length > 0) {
|
||||
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
||||
it++;
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
||||
#endif
|
||||
|
||||
it++;
|
||||
|
||||
if (source == 0) {
|
||||
buffer.erase_after(buffer.before_begin());
|
||||
} else {
|
||||
@@ -13354,9 +13450,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
||||
// tokenizer.encode('', add_special_tokens=False) returns []
|
||||
|
||||
static const bool rtrim = true; //TODO: as param
|
||||
bool is_prev_special = false;
|
||||
bool special_token_rtrim = false;
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
@@ -13366,25 +13460,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||
|
||||
// TODO: It's likely possible to get rid of this string copy entirely
|
||||
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
||||
// and passing 'add space prefix' as bool argument
|
||||
//
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
if (special_token_rtrim) {
|
||||
size_t num_whitespaces = 0;
|
||||
while (isspace(raw_text[num_whitespaces])) {
|
||||
num_whitespaces++;
|
||||
}
|
||||
if (num_whitespaces == raw_text.size()) {
|
||||
continue; // skip if all whitespaces
|
||||
}
|
||||
raw_text = raw_text.substr(num_whitespaces);
|
||||
}
|
||||
|
||||
if (vocab.add_space_prefix) {
|
||||
if (!output.size() || is_prev_special) { // prefix with space if first token
|
||||
raw_text = " " + raw_text;
|
||||
@@ -13400,11 +13477,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
output.push_back(fragment.token);
|
||||
is_prev_special = true;
|
||||
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
||||
special_token_rtrim = rtrim
|
||||
&& fragment.token != vocab.special_bos_id
|
||||
&& fragment.token != vocab.special_unk_id
|
||||
&& fragment.token != vocab.special_eos_id;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16160,7 +16232,7 @@ struct llama_model * llama_load_model_from_file(
|
||||
return true;
|
||||
};
|
||||
}
|
||||
if (params.rpc_servers != nullptr) {
|
||||
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
||||
// split the servers set them into model->rpc_servers
|
||||
std::string servers(params.rpc_servers);
|
||||
size_t pos = 0;
|
||||
@@ -16323,17 +16395,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
// initialize backends
|
||||
#if defined(GGML_USE_RPC)
|
||||
for (auto & server : model->rpc_servers) {
|
||||
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#elif defined(GGML_USE_METAL)
|
||||
#if defined(GGML_USE_METAL)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ctx->backend_metal = ggml_backend_metal_init();
|
||||
if (ctx->backend_metal == nullptr) {
|
||||
@@ -16425,6 +16487,19 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#endif
|
||||
#if defined(GGML_USE_RPC)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
for (const auto & endpoint : model->rpc_servers) {
|
||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
@@ -18207,9 +18282,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
||||
return model->vocab.id_to_token[token].score;
|
||||
}
|
||||
|
||||
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
||||
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
||||
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return model->vocab.id_to_token[token].type;
|
||||
return model->vocab.id_to_token[token].attr;
|
||||
}
|
||||
|
||||
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||
|
||||
18
llama.h
18
llama.h
@@ -97,7 +97,7 @@ extern "C" {
|
||||
LLAMA_ROPE_TYPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llama_token_type {
|
||||
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
|
||||
@@ -107,6 +107,20 @@ extern "C" {
|
||||
LLAMA_TOKEN_TYPE_BYTE = 6,
|
||||
};
|
||||
|
||||
enum llama_token_attr {
|
||||
LLAMA_TOKEN_ATTR_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1,
|
||||
LLAMA_TOKEN_ATTR_UNUSED = 1 << 2,
|
||||
LLAMA_TOKEN_ATTR_NORMAL = 1 << 3,
|
||||
LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL?
|
||||
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5,
|
||||
LLAMA_TOKEN_ATTR_BYTE = 1 << 6,
|
||||
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7,
|
||||
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8,
|
||||
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9,
|
||||
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10,
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
@@ -821,7 +835,7 @@ extern "C" {
|
||||
|
||||
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
||||
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
|
||||
Binary file not shown.
@@ -10,16 +10,18 @@ set -x
|
||||
|
||||
bench_args="${@:3}"
|
||||
|
||||
rm -f llama-bench.sqlite
|
||||
rm -f llama-bench.sqlite > /dev/null
|
||||
|
||||
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
|
||||
|
||||
git checkout $1
|
||||
make clean && make -j32 $make_opts llama-bench
|
||||
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
|
||||
git checkout $1 > /dev/null
|
||||
make clean > /dev/null
|
||||
make -j$(nproc) $make_opts llama-bench > /dev/null
|
||||
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
|
||||
git checkout $2
|
||||
make clean && make -j32 $make_opts llama-bench
|
||||
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
|
||||
git checkout $2 > /dev/null
|
||||
make clean > /dev/null
|
||||
make -j$(nproc) $make_opts llama-bench > /dev/null
|
||||
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
|
||||
./scripts/compare-llama-bench.py -b $1 -c $2
|
||||
|
||||
@@ -156,17 +156,39 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||
'<s>a', # Phi-3 fail
|
||||
'<unk><|endoftext|><s>', # Phi-3 fail
|
||||
'a\na', # TODO: Bert fail
|
||||
'a </s> b', # rstrip phi-3
|
||||
'a <mask> b', # lstrip jina-v2
|
||||
]
|
||||
|
||||
|
||||
def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
special_tokens = set(tokenizer.all_special_tokens)
|
||||
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
|
||||
special_tokens = list(sorted(special_tokens))
|
||||
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
|
||||
"""Brute force check all vocab words"""
|
||||
yield from vocab
|
||||
|
||||
|
||||
def generator_added_lr_strip(tokenizer) -> Iterator[str]:
|
||||
WHITESPACES = ["", " ", " ", " "]
|
||||
special_tokens = list(tokenizer.all_special_tokens)
|
||||
added_tokens = list(tokenizer.added_tokens_encoder)
|
||||
all_tokens = list(sorted(set(special_tokens + added_tokens)))
|
||||
for token in all_tokens:
|
||||
for lstrip in WHITESPACES:
|
||||
for rstrip in WHITESPACES:
|
||||
yield lstrip + token + rstrip
|
||||
yield "a" + lstrip + token + rstrip
|
||||
yield lstrip + token + rstrip + "z"
|
||||
yield "a" + lstrip + token + rstrip + "z"
|
||||
|
||||
|
||||
def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
special_tokens = list(tokenizer.all_special_tokens)
|
||||
added_tokens = list(tokenizer.added_tokens_encoder)
|
||||
separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
|
||||
all_tokens = list(sorted(set(special_tokens + added_tokens + separations)))
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
rand.seed(m)
|
||||
words = rand.choices(special_tokens, k=500)
|
||||
words = rand.choices(all_tokens, k=500)
|
||||
if words[0] == tokenizer.bos_token: # skip spam warning of double BOS
|
||||
while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS
|
||||
words.pop(0)
|
||||
@@ -175,11 +197,6 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
yield "".join(words)
|
||||
|
||||
|
||||
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
|
||||
"""Brute force check all vocab words"""
|
||||
yield from vocab
|
||||
|
||||
|
||||
def generator_random_chars(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text with simple characters"""
|
||||
|
||||
@@ -274,8 +291,8 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g
|
||||
ids2 = func_tokenize2(text)
|
||||
if ids1 != ids2:
|
||||
i = find_first_mismatch(ids1, ids2)
|
||||
ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
|
||||
ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
|
||||
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
|
||||
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
|
||||
logger.info(" TokenIDs: " + str(ids1))
|
||||
logger.info(" Expected: " + str(ids2))
|
||||
raise Exception()
|
||||
@@ -309,8 +326,9 @@ def main(argv: list[str] = None):
|
||||
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
|
||||
@@ -322,14 +340,14 @@ def main(argv: list[str] = None):
|
||||
if __name__ == "__main__":
|
||||
# main()
|
||||
|
||||
path_tokenizers = "./models/tokenizers/"
|
||||
path_tokenizers = "./models/tokenizers/"
|
||||
path_vocab_format = "./models/ggml-vocab-%s.gguf"
|
||||
|
||||
# import os
|
||||
# tokenizers = os.listdir(path_tokenizers)
|
||||
tokenizers = [
|
||||
# "llama-spm", # SPM
|
||||
# "phi-3", # SPM
|
||||
"llama-spm", # SPM
|
||||
"phi-3", # SPM
|
||||
"jina-v2-en", # WPM
|
||||
"bert-bge", # WPM
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user