mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-21 08:24:06 +00:00
Compare commits
39 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dcdcee3a74 | ||
|
|
1f4111e540 | ||
|
|
befaf1197f | ||
|
|
feff4aa846 | ||
|
|
0abc6a2c25 | ||
|
|
bd35cb0ae3 | ||
|
|
78203641fe | ||
|
|
e6b7801bd1 | ||
|
|
e665744317 | ||
|
|
d4c3c10fad | ||
|
|
2a825116b6 | ||
|
|
4dc4f5f14a | ||
|
|
c837981bba | ||
|
|
3c26a1644d | ||
|
|
ff76e18516 | ||
|
|
39f852f440 | ||
|
|
2b00fa7997 | ||
|
|
d6a04f872d | ||
|
|
c9c8575a1a | ||
|
|
df4b7945ae | ||
|
|
449ccfb6f5 | ||
|
|
1b28061400 | ||
|
|
8db003a19d | ||
|
|
0996c5597f | ||
|
|
5bb2c5dbd2 | ||
|
|
67155ab7f5 | ||
|
|
5af118efda | ||
|
|
d2b496bff4 | ||
|
|
b34e023480 | ||
|
|
51b6038636 | ||
|
|
cb9c933eb2 | ||
|
|
6cd4e03444 | ||
|
|
8d300bd35f | ||
|
|
49006c67b4 | ||
|
|
00ba2ff781 | ||
|
|
83008b7cfe | ||
|
|
0b4ac75772 | ||
|
|
fb3f249815 | ||
|
|
bfe76d4a17 |
16
.github/workflows/build.yml
vendored
16
.github/workflows/build.yml
vendored
@@ -375,7 +375,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -401,7 +401,7 @@ jobs:
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
@@ -442,7 +442,7 @@ jobs:
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
@@ -546,7 +546,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -576,7 +576,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -610,7 +610,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -969,14 +969,14 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
1
.github/workflows/server.yml
vendored
1
.github/workflows/server.yml
vendored
@@ -173,6 +173,7 @@ jobs:
|
||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
|
||||
- name: Slow tests
|
||||
|
||||
@@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
|
||||
# determining _precisely_ which defines are necessary for the llama-config
|
||||
# package.
|
||||
#
|
||||
set(GGML_TRANSIENT_DEFINES)
|
||||
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
|
||||
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
|
||||
if (GGML_DIR_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
|
||||
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
|
||||
if (GGML_TARGET_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
|
||||
17
Makefile
17
Makefile
@@ -434,7 +434,7 @@ endif
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
|
||||
ifndef RISCV
|
||||
ifndef RISCV_CROSS_COMPILE
|
||||
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
@@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
|
||||
MK_CXXFLAGS += -mlasx
|
||||
endif
|
||||
|
||||
else
|
||||
ifneq ($(filter riscv64%,$(UNAME_M)),)
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
else # RISC-V CROSS COMPILATION
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
@@ -925,6 +930,7 @@ OBJ_LLAMA = \
|
||||
|
||||
OBJ_COMMON = \
|
||||
common/common.o \
|
||||
common/arg.o \
|
||||
common/console.o \
|
||||
common/ngram-cache.o \
|
||||
common/sampling.o \
|
||||
@@ -1157,6 +1163,11 @@ common/common.o: \
|
||||
include/llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/arg.o: \
|
||||
common/arg.cpp \
|
||||
common/arg.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/sampling.o: \
|
||||
common/sampling.cpp \
|
||||
common/sampling.h \
|
||||
@@ -1429,6 +1440,7 @@ llama-server: \
|
||||
examples/server/system-prompts.js.hpp \
|
||||
examples/server/prompt-formats.js.hpp \
|
||||
examples/server/json-schema-to-grammar.mjs.hpp \
|
||||
examples/server/loading.html.hpp \
|
||||
common/json.hpp \
|
||||
common/stb_image.h \
|
||||
$(OBJ_ALL)
|
||||
@@ -1448,7 +1460,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
./llama-gen-docs
|
||||
|
||||
libllava.a: examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
|
||||
@@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
||||
|
||||
|
||||
@@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
|
||||
base64.hpp
|
||||
common.h
|
||||
common.cpp
|
||||
arg.h
|
||||
arg.cpp
|
||||
sampling.h
|
||||
sampling.cpp
|
||||
console.h
|
||||
|
||||
1995
common/arg.cpp
Normal file
1995
common/arg.cpp
Normal file
File diff suppressed because it is too large
Load Diff
77
common/arg.h
Normal file
77
common/arg.h
Normal file
@@ -0,0 +1,77 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
struct llama_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::vector<const char *> args;
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
bool is_sparam = false; // is current arg a sampling param?
|
||||
void (*handler_void) (gpt_params & params) = nullptr;
|
||||
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (gpt_params & params, int) = nullptr;
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, int)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params)
|
||||
) : args(args), help(help), handler_void(handler) {}
|
||||
|
||||
// support 2 values for arg
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const char * value_hint_2,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||
|
||||
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
|
||||
llama_arg & set_env(const char * env);
|
||||
llama_arg & set_sparam();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output);
|
||||
bool has_value_from_env();
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
struct gpt_params_context {
|
||||
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
|
||||
gpt_params & params;
|
||||
std::vector<llama_arg> options;
|
||||
void(*print_usage)(int, char **) = nullptr;
|
||||
gpt_params_context(gpt_params & params) : params(params) {}
|
||||
};
|
||||
|
||||
// parse input arguments from CLI
|
||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
1981
common/common.cpp
1981
common/common.cpp
File diff suppressed because it is too large
Load Diff
216
common/common.h
216
common/common.h
@@ -4,20 +4,11 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include "sampling.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
#include <functional>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
};
|
||||
|
||||
int32_t cpu_get_num_physical_cores();
|
||||
int32_t cpu_get_num_math();
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
// Common params
|
||||
//
|
||||
|
||||
enum llama_example {
|
||||
@@ -78,28 +78,72 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_LOOKUP,
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
enum gpt_sampler_type {
|
||||
GPT_SAMPLER_TYPE_NONE = 0,
|
||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
enum dimre_method {
|
||||
DIMRE_METHOD_PCA,
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
// sampler parameters
|
||||
struct gpt_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
|
||||
std::vector<enum gpt_sampler_type> samplers = {
|
||||
GPT_SAMPLER_TYPE_TOP_K,
|
||||
GPT_SAMPLER_TYPE_TFS_Z,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||
GPT_SAMPLER_TYPE_TOP_P,
|
||||
GPT_SAMPLER_TYPE_MIN_P,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
struct gpt_params {
|
||||
enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
|
||||
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
@@ -143,23 +187,23 @@ struct gpt_params {
|
||||
|
||||
struct gpt_sampler_params sparams;
|
||||
|
||||
std::string model = ""; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string model_url = ""; // model url to download
|
||||
std::string hf_token = ""; // HF token
|
||||
std::string hf_repo = ""; // HF repo
|
||||
std::string hf_file = ""; // HF file
|
||||
std::string prompt = "";
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_alias = "unknown"; // model alias // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
||||
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
|
||||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
@@ -189,7 +233,6 @@ struct gpt_params {
|
||||
|
||||
bool kl_divergence = false; // compute KL divergence
|
||||
|
||||
std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
|
||||
bool usage = false; // print usage
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool special = false; // enable special token output
|
||||
@@ -204,6 +247,7 @@ struct gpt_params {
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||
bool flash_attn = false; // flash attention
|
||||
bool no_perf = false; // disable performance metrics
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
@@ -220,7 +264,7 @@ struct gpt_params {
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
@@ -236,15 +280,15 @@ struct gpt_params {
|
||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "";
|
||||
std::string chat_template = "";
|
||||
std::string system_prompt = "";
|
||||
std::string public_path = ""; // NOLINT
|
||||
std::string chat_template = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
std::string ssl_file_key = "";
|
||||
std::string ssl_file_cert = "";
|
||||
std::string ssl_file_key = ""; // NOLINT
|
||||
std::string ssl_file_cert = ""; // NOLINT
|
||||
|
||||
bool endpoint_slots = true;
|
||||
bool endpoint_metrics = false;
|
||||
@@ -299,92 +343,6 @@ struct gpt_params {
|
||||
bool batched_bench_output_jsonl = false;
|
||||
};
|
||||
|
||||
struct llama_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::vector<const char *> args;
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
void (*handler_void) (gpt_params & params) = nullptr;
|
||||
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (gpt_params & params, int) = nullptr;
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, int)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params)
|
||||
) : args(args), help(help), handler_void(handler) {}
|
||||
|
||||
// support 2 values for arg
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const char * value_hint_2,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||
|
||||
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
this->examples = std::move(examples);
|
||||
return *this;
|
||||
}
|
||||
|
||||
llama_arg & set_env(const char * env) {
|
||||
help = help + "\n(env: " + env + ")";
|
||||
this->env = env;
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool in_example(enum llama_example ex) {
|
||||
return examples.find(ex) != examples.end();
|
||||
}
|
||||
|
||||
bool get_value_from_env(std::string & output) const {
|
||||
if (env == nullptr) return false;
|
||||
char * value = std::getenv(env);
|
||||
if (value) {
|
||||
output = value;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_value_from_env() const {
|
||||
return env != nullptr && std::getenv(env);
|
||||
}
|
||||
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
// initialize list of options (arguments) that can be used by the current example
|
||||
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
|
||||
// optionally, we can provide "print_usage" to print example usage
|
||||
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
|
||||
|
||||
// parse input arguments from CLI
|
||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
||||
|
||||
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
|
||||
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
|
||||
|
||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
template<typename T>
|
||||
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
|
||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = false; // TODO: control via params
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
auto * result = new gpt_sampler {
|
||||
/* .params = */ params,
|
||||
@@ -254,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||
// TODO: measure grammar performance
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -307,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
|
||||
return cur_p.data[cur_p.selected].id;
|
||||
}
|
||||
|
||||
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
|
||||
return llama_sampler_get_seed(gsmpl->chain);
|
||||
}
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
|
||||
@@ -420,7 +427,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
|
||||
}
|
||||
|
||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map {
|
||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
|
||||
@@ -2,61 +2,11 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
enum gpt_sampler_type {
|
||||
GPT_SAMPLER_TYPE_NONE = 0,
|
||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
struct gpt_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
bool ignore_eos = false;
|
||||
|
||||
std::vector<enum gpt_sampler_type> samplers = {
|
||||
GPT_SAMPLER_TYPE_TOP_K,
|
||||
GPT_SAMPLER_TYPE_TFS_Z,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||
GPT_SAMPLER_TYPE_TOP_P,
|
||||
GPT_SAMPLER_TYPE_MIN_P,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
// gpt_sampler extends llama_sampler with additional functionality:
|
||||
//
|
||||
// - grammar support
|
||||
@@ -110,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||
//
|
||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||
|
||||
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
|
||||
|
||||
// helpers
|
||||
|
||||
// access the internal list of current candidate tokens
|
||||
|
||||
@@ -302,6 +302,8 @@ class Model:
|
||||
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
||||
)
|
||||
)
|
||||
or not new_name.endswith(".weight")
|
||||
@@ -624,6 +626,9 @@ class Model:
|
||||
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
|
||||
res = "exaone"
|
||||
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
|
||||
# ref: https://huggingface.co/microsoft/phi-2
|
||||
res = "phi-2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2769,6 +2774,8 @@ class Rwkv6Model(Model):
|
||||
self.gguf_writer.add_tokenizer_model("rwkv")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
|
||||
@@ -31,6 +31,7 @@ import re
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
|
||||
from hashlib import sha256
|
||||
from enum import IntEnum, auto
|
||||
@@ -97,6 +98,7 @@ models = [
|
||||
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
|
||||
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
|
||||
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
|
||||
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
|
||||
]
|
||||
|
||||
|
||||
@@ -125,12 +127,27 @@ def download_model(model):
|
||||
if tokt == TOKENIZER_TYPE.UGM:
|
||||
files.append("spiece.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
if os.path.isdir(repo):
|
||||
# If repo is a path on the file system, copy the directory
|
||||
for file in files:
|
||||
src_path = os.path.join(repo, file)
|
||||
dst_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(dst_path):
|
||||
logger.info(f"{name}: File {dst_path} already exists - skipping")
|
||||
continue
|
||||
if os.path.isfile(src_path):
|
||||
shutil.copy2(src_path, dst_path)
|
||||
logger.info(f"{name}: Copied {src_path} to {dst_path}")
|
||||
else:
|
||||
logger.warning(f"{name}: Source file {src_path} does not exist")
|
||||
else:
|
||||
# If repo is a URL, download the files
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
|
||||
|
||||
for model in models:
|
||||
|
||||
@@ -363,7 +363,13 @@ if __name__ == '__main__':
|
||||
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = super().modify_tensors(data_torch, name, bid)
|
||||
dest = list(super().modify_tensors(data_torch, name, bid))
|
||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
||||
# therefore, we ignore them for now
|
||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
assert isinstance(dest_data, LoraTorchTensor)
|
||||
lora_a, lora_b = dest_data.get_lora_A_B()
|
||||
|
||||
@@ -1,33 +1,12 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// mutates the input string
|
||||
static std::vector<int> parse_list(char * p) {
|
||||
std::vector<int> ret;
|
||||
|
||||
char * q = p;
|
||||
|
||||
while (*p) {
|
||||
if (*p == ',') {
|
||||
*p = '\0';
|
||||
ret.push_back(std::atoi(q));
|
||||
q = p + 1;
|
||||
}
|
||||
|
||||
++p;
|
||||
}
|
||||
|
||||
ret.push_back(std::atoi(q));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||
@@ -37,8 +16,7 @@ static void print_usage(int, char ** argv) {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -209,7 +187,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
||||
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
|
||||
|
||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
|
||||
|
||||
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
|
||||
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
|
||||
llama_perf_sampler_print(smpl)
|
||||
llama_perf_context_print(context)
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let utf8Count = text.utf8.count
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "Hello my name is";
|
||||
params.n_predict = 32;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_sampler_print(smpl);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
||||
@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
|
||||
|
||||
TENSOR_DUMP(gf->nodes[0]);
|
||||
TENSOR_DUMP(ggml_graph_node(gf, 0));
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
|
||||
@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
|
||||
float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
printf("=====================================================================================\n");
|
||||
@@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
|
||||
float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
@@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,12 +12,9 @@
|
||||
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#define DEBUG_POS 5
|
||||
|
||||
@@ -229,8 +226,8 @@ static ggml_status compute_piter(
|
||||
result.eigenvectors.resize(params.n_batch);
|
||||
result.distances.resize(params.n_batch);
|
||||
// get output nodes
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
auto node = gf->nodes[i];
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
||||
auto node = ggml_graph_node(gf, i);
|
||||
int iter = -1;
|
||||
// find b_tensor (without copying data from device)
|
||||
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@@ -308,7 +306,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(batch);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
@@ -144,8 +145,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
@@ -369,7 +370,7 @@ struct lora_merge_ctx {
|
||||
|
||||
// write data to output file
|
||||
{
|
||||
auto result = gf->nodes[gf->n_nodes - 1];
|
||||
auto * result = ggml_graph_node(gf, -1);
|
||||
size_t len = ggml_nbytes(result);
|
||||
if (read_buf.size() < len) {
|
||||
read_buf.resize(len);
|
||||
@@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <fstream>
|
||||
@@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
|
||||
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
||||
|
||||
gpt_params params;
|
||||
auto options = gpt_params_parser_init(params, ex);
|
||||
auto ctx_arg = gpt_params_parser_init(params, ex);
|
||||
|
||||
file << "| Argument | Explanation |\n";
|
||||
file << "| -------- | ----------- |\n";
|
||||
for (auto & opt : options) {
|
||||
for (auto & opt : ctx_arg.options) {
|
||||
file << "| `";
|
||||
// args
|
||||
for (const auto & arg : opt.args) {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -153,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
|
||||
int main(int argc, char * argv[]) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
|
||||
params.logits_all = true;
|
||||
params.verbosity = 1;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
|
||||
g_collector.save_imatrix();
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -301,6 +299,9 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
smpl = gpt_sampler_init(model, sparams);
|
||||
|
||||
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
@@ -340,8 +341,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
smpl = gpt_sampler_init(model, sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
|
||||
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
|
||||
fflush(p_err->fout);
|
||||
}
|
||||
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
|
||||
```
|
||||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
|
||||
```sh
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
|
||||
|
||||
@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#include "ggml.h"
|
||||
#include "arg.h"
|
||||
#include "base64.hpp"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include "base64.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
@@ -278,8 +279,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
||||
@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
||||
ggml_build_forward_expand(gf, flatten);
|
||||
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
||||
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor* result = ggml_graph_node(gf, -1);
|
||||
|
||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
#include "ggml.h"
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
@@ -16,8 +18,8 @@ struct llava_context {
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
@@ -253,8 +255,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -318,7 +319,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
@@ -36,8 +38,7 @@ struct ngram_container {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
@@ -13,8 +14,7 @@
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||
|
||||
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#include "ggml.h"
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
@@ -15,8 +16,7 @@
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#include "arg.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
@@ -12,8 +14,7 @@
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -239,8 +240,7 @@ int main(int argc, char ** argv){
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
gpt_perf_print(ctx, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -192,8 +191,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -471,8 +468,10 @@ int main(int argc, char ** argv) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
|
||||
LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
|
||||
LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
// group-attention state
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
// A basic application simulating a server with multiple clients.
|
||||
// The clients submit requests to the server and they are processed in parallel.
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
@@ -100,8 +102,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -414,7 +415,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
// TODO: print sampling/grammar timings for all clients
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
|
||||
params.n_keep = 32;
|
||||
params.i_pos = -1;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
|
||||
params.n_ctx = 512;
|
||||
params.logits_all = true;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@@ -2049,7 +2047,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
set(TARGET llama-quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(query_batch);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "The quick brown fox";
|
||||
params.sparams.seed = 1234;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ set(PUBLIC_ASSETS
|
||||
system-prompts.js
|
||||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
loading.html
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
|
||||
@@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--version` | show version and build info |
|
||||
| `-v, --verbose` | print verbose information |
|
||||
| `--verbosity N` | set specific verbosity level (default: 0) |
|
||||
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
||||
| `--no-display-prompt` | don't print prompt at generation (default: false) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
||||
| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
||||
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
||||
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||
| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
|
||||
| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
|
||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
||||
| `--in-file FNAME` | an input file (repeat to specify multiple files) |
|
||||
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||
| `--no-escape` | do not process escape sequences |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||
@@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
|
||||
| `-ns, --sequences N` | number of sequences to decode (default: 1) |
|
||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
|
||||
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
|
||||
@@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
|
||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
@@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
|
||||
| `--timeout N` | server read/write timeout in seconds (default: 600) |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
|
||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
||||
| `--log-format {text, json}` | log output format: json or text (default: json) |
|
||||
@@ -412,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to tokenize.
|
||||
`content`: (Required) The text to tokenize.
|
||||
|
||||
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
|
||||
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
|
||||
|
||||
**Response:**
|
||||
|
||||
Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
|
||||
|
||||
|
||||
If `with_pieces` is `false`:
|
||||
```json
|
||||
{
|
||||
"tokens": [123, 456, 789]
|
||||
}
|
||||
```
|
||||
|
||||
If `with_pieces` is `true`:
|
||||
```json
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 123, "piece": "Hello"},
|
||||
{"id": 456, "piece": " world"},
|
||||
{"id": 789, "piece": "!"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
|
||||
```json
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 198, "piece": [195]}, // hex C3
|
||||
{"id": 164, "piece": [161]} // hex A1
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### POST `/detokenize`: Convert tokens to text
|
||||
|
||||
|
||||
12
examples/server/public/loading.html
Normal file
12
examples/server/public/loading.html
Normal file
@@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="refresh" content="5">
|
||||
</head>
|
||||
<body>
|
||||
<div id="loading">
|
||||
The model is loading. Please wait.<br/>
|
||||
The user interface will appear soon.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "utils.hpp"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -26,6 +28,7 @@
|
||||
#include "system-prompts.js.hpp"
|
||||
#include "prompt-formats.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
#include "loading.html.hpp"
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
@@ -1264,6 +1267,7 @@ struct server_context {
|
||||
{"n_predict", slot.n_predict}, // Server configured n_predict
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.sparams.seed},
|
||||
{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
|
||||
{"temperature", slot.sparams.temp},
|
||||
{"dynatemp_range", slot.sparams.dynatemp_range},
|
||||
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
||||
@@ -2423,8 +2427,7 @@ int main(int argc, char ** argv) {
|
||||
// own arguments required by this example
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2590,10 +2593,16 @@ int main(int argc, char ** argv) {
|
||||
return false;
|
||||
};
|
||||
|
||||
auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
|
||||
auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
|
||||
server_state current_state = state.load();
|
||||
if (current_state == SERVER_STATE_LOADING_MODEL) {
|
||||
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
|
||||
auto tmp = string_split(req.path, '.');
|
||||
if (req.path == "/" || tmp.back() == "html") {
|
||||
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
|
||||
res.status = 503;
|
||||
} else {
|
||||
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -2984,6 +2993,8 @@ int main(int argc, char ** argv) {
|
||||
}, [&](json error_data) {
|
||||
server_sent_event(sink, "error", error_data);
|
||||
});
|
||||
static const std::string ev_done = "data: [DONE]\n\n";
|
||||
sink.write(ev_done.data(), ev_done.size());
|
||||
sink.done();
|
||||
return true;
|
||||
};
|
||||
@@ -3011,12 +3022,39 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
json tokens_response = json::array();
|
||||
if (body.count("content") != 0) {
|
||||
const bool add_special = json_value(body, "add_special", false);
|
||||
tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||
const bool with_pieces = json_value(body, "with_pieces", false);
|
||||
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||
|
||||
if (with_pieces) {
|
||||
for (const auto& token : tokens) {
|
||||
std::string piece = llama_token_to_piece(ctx_server.ctx, token);
|
||||
json piece_json;
|
||||
|
||||
// Check if the piece is valid UTF-8
|
||||
if (is_valid_utf8(piece)) {
|
||||
piece_json = piece;
|
||||
} else {
|
||||
// If not valid UTF-8, store as array of byte values
|
||||
piece_json = json::array();
|
||||
for (unsigned char c : piece) {
|
||||
piece_json.push_back(static_cast<int>(c));
|
||||
}
|
||||
}
|
||||
|
||||
tokens_response.push_back({
|
||||
{"id", token},
|
||||
{"piece", piece_json}
|
||||
});
|
||||
}
|
||||
} else {
|
||||
tokens_response = tokens;
|
||||
}
|
||||
}
|
||||
const json data = format_tokenizer_response(tokens);
|
||||
|
||||
const json data = format_tokenizer_response(tokens_response);
|
||||
res_ok(res, data);
|
||||
};
|
||||
|
||||
|
||||
@@ -105,6 +105,14 @@ Feature: llama.cpp server
|
||||
Given first token is removed
|
||||
Then tokens can be detokenized
|
||||
|
||||
Scenario: Tokenize with pieces
|
||||
When tokenizing with pieces:
|
||||
"""
|
||||
What is the capital of Germany?
|
||||
媽
|
||||
"""
|
||||
Then tokens are given with pieces
|
||||
|
||||
Scenario: Models available
|
||||
Given available models
|
||||
Then 1 models are supported
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
|
||||
context.tokenize_add_special = True
|
||||
|
||||
|
||||
@step("tokenizing with pieces")
|
||||
@async_run_until_complete
|
||||
async def step_tokenize_with_pieces(context):
|
||||
context.tokenized_text = context_text(context)
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
|
||||
if getattr(context, "tokenize_add_special", None) is not None:
|
||||
tokenize_args["add_special"] = context.tokenize_add_special
|
||||
|
||||
async with session.post(
|
||||
f"{context.base_url}/tokenize", json=tokenize_args
|
||||
) as response:
|
||||
assert response.status == 200
|
||||
tokenize_json = await response.json()
|
||||
context.tokens_with_pieces = tokenize_json["tokens"]
|
||||
|
||||
|
||||
@step("tokens are given with pieces")
|
||||
@async_run_until_complete
|
||||
async def step_tokenize_with_pieces(context):
|
||||
# Verify that the response contains both token IDs and pieces
|
||||
assert all(
|
||||
"id" in token and "piece" in token for token in context.tokens_with_pieces
|
||||
)
|
||||
|
||||
|
||||
@step('tokenizing')
|
||||
@async_run_until_complete
|
||||
async def step_tokenize(context):
|
||||
@@ -991,6 +1020,8 @@ async def oai_chat_completions(user_prompt,
|
||||
event_data = line.split(': ', 1)
|
||||
assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
|
||||
chunk_raw = event_data[1]
|
||||
if chunk_raw == '[DONE]':
|
||||
break
|
||||
|
||||
chunk = json.loads(chunk_raw)
|
||||
assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
|
||||
|
||||
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
||||
return res;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
|
||||
static bool is_valid_utf8(const std::string & str) {
|
||||
const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
|
||||
const unsigned char* end = bytes + str.length();
|
||||
|
||||
while (bytes < end) {
|
||||
if (*bytes <= 0x7F) {
|
||||
// 1-byte sequence (0xxxxxxx)
|
||||
bytes++;
|
||||
} else if ((*bytes & 0xE0) == 0xC0) {
|
||||
// 2-byte sequence (110xxxxx 10xxxxxx)
|
||||
if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 2;
|
||||
} else if ((*bytes & 0xF0) == 0xE0) {
|
||||
// 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
|
||||
if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 3;
|
||||
} else if ((*bytes & 0xF8) == 0xF0) {
|
||||
// 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
||||
if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
|
||||
(bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 4;
|
||||
} else {
|
||||
// Invalid UTF-8 lead byte
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const json & tokens) {
|
||||
return json {
|
||||
{"tokens", tokens}
|
||||
};
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "Hello my name is";
|
||||
params.n_predict = 32;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_sampler_print(smpl);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <random>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
@@ -27,8 +29,7 @@ struct seq_draft {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -615,7 +616,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_TEE("\ndraft:\n\n");
|
||||
// TODO: print sampling/grammar timings for all drafts
|
||||
llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_dft);
|
||||
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
gpt_perf_print(ctx_tgt, smpl);
|
||||
|
||||
@@ -4,33 +4,23 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
GGML_SYCL_SINGLE_GPU=1
|
||||
else
|
||||
GGML_SYCL_DEVICE=0
|
||||
GGML_SYCL_SINGLE_GPU=0
|
||||
fi
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=llama-2-7b.Q4_0.gguf
|
||||
NGL=33
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
20
flake.lock
generated
20
flake.lock
generated
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1725024810,
|
||||
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
|
||||
"lastModified": 1725234343,
|
||||
"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
|
||||
"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1724819573,
|
||||
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
|
||||
"lastModified": 1725634671,
|
||||
"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
|
||||
"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -36,14 +36,14 @@
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1722555339,
|
||||
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
|
||||
"lastModified": 1725233747,
|
||||
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
||||
@@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
|
||||
*/
|
||||
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
||||
|
||||
/**
|
||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||
*
|
||||
* @return A pointer to the host buffer type interface.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the description of a specific CANN device.
|
||||
*
|
||||
|
||||
@@ -358,6 +358,7 @@ extern "C" {
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
struct ggml_cgraph;
|
||||
|
||||
// NOTE: always add types at the end of the enum to keep backward compatibility
|
||||
enum ggml_type {
|
||||
@@ -575,23 +576,9 @@ extern "C" {
|
||||
GGML_TENSOR_FLAG_PARAM = 4,
|
||||
};
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_type type;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
@@ -655,7 +642,7 @@ extern "C" {
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
@@ -671,35 +658,6 @@ extern "C" {
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
@@ -2017,8 +1975,6 @@ extern "C" {
|
||||
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||
|
||||
#define GGML_N_TASKS_MAX -1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_map_custom1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -2088,30 +2044,35 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
||||
|
||||
// graph allocation in a context
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
|
||||
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
|
||||
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
|
||||
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
@@ -2509,6 +2470,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
@@ -26,6 +26,8 @@ if (NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
unset(GGML_EXTRA_LIBS)
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if (ACCELERATE_FRAMEWORK)
|
||||
@@ -35,7 +37,7 @@ if (APPLE AND GGML_ACCELERATE)
|
||||
add_compile_definitions(ACCELERATE_NEW_LAPACK)
|
||||
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
|
||||
list(APPEND GGML_EXTRA_LIBS ${ACCELERATE_FRAMEWORK})
|
||||
else()
|
||||
message(WARNING "Accelerate framework not found")
|
||||
endif()
|
||||
@@ -87,7 +89,7 @@ if (GGML_METAL)
|
||||
COMMENT "Generate assembly for embedded Metal library"
|
||||
)
|
||||
|
||||
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
|
||||
list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
|
||||
else()
|
||||
if (GGML_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
@@ -132,7 +134,7 @@ if (GGML_METAL)
|
||||
)
|
||||
endif() # GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
|
||||
list(APPEND GGML_EXTRA_LIBS
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
@@ -157,11 +159,11 @@ if (GGML_OPENMP)
|
||||
|
||||
add_compile_definitions(GGML_USE_OPENMP)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
list(APPEND GGML_EXTRA_LIBS OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
|
||||
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp")
|
||||
list(APPEND GGML_EXTRA_LIBS "/usr/lib/llvm-10/lib/libomp.so")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "OpenMP not found")
|
||||
@@ -244,8 +246,8 @@ if (GGML_BLAS)
|
||||
set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
|
||||
set(GGML_SOURCES_BLAS ggml-blas.cpp)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
||||
list(APPEND GGML_EXTRA_LIBS ${BLAS_LIBRARIES})
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS})
|
||||
else()
|
||||
message(WARNING "BLAS not found, please refer to "
|
||||
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
|
||||
@@ -368,19 +370,19 @@ if (GGML_CUDA)
|
||||
if (GGML_STATIC)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
else ()
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
|
||||
list(APPEND GGML_EXTRA_LIBS MUSA::musart_static MUSA::mublas_static)
|
||||
else()
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
|
||||
list(APPEND GGML_EXTRA_LIBS MUSA::musart MUSA::mublas)
|
||||
else()
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
list(APPEND GGML_EXTRA_LIBS CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -388,9 +390,9 @@ if (GGML_CUDA)
|
||||
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
|
||||
else()
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
|
||||
list(APPEND GGML_EXTRA_LIBS MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
|
||||
else()
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||
list(APPEND GGML_EXTRA_LIBS CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
@@ -495,7 +497,7 @@ if (GGML_HIPBLAS)
|
||||
|
||||
if (CXX_IS_HIPCC)
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
|
||||
list(APPEND GGML_EXTRA_LIBS hip::device)
|
||||
else()
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
||||
endif()
|
||||
@@ -504,7 +506,8 @@ if (GGML_HIPBLAS)
|
||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||
endif()
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
# TODO: this "PUBLIC" here seems wrong
|
||||
list(APPEND GGML_EXTRA_LIBS PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
@@ -513,7 +516,8 @@ if (GGML_SYCL)
|
||||
endif()
|
||||
|
||||
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
|
||||
if ( DEFINED ENV{ONEAPI_ROOT})
|
||||
|
||||
if (DEFINED ENV{ONEAPI_ROOT})
|
||||
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
|
||||
elseif(SUPPORTS_SYCL)
|
||||
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
|
||||
@@ -551,21 +555,27 @@ if (GGML_SYCL)
|
||||
|
||||
find_package(DNNL)
|
||||
message("-- DNNL found:" ${DNNL_FOUND})
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
|
||||
else()
|
||||
add_compile_definitions(GGML_SYCL_DNNL=0)
|
||||
endif()
|
||||
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
list(APPEND GGML_EXTRA_LIBS IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
else()
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
list(APPEND GGML_EXTRA_LIBS OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
|
||||
list(APPEND GGML_EXTRA_LIBS pthread m dl onemkl)
|
||||
endif()
|
||||
endif()
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
@@ -579,7 +589,7 @@ if (GGML_RPC)
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
|
||||
|
||||
if (WIN32)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
|
||||
list(APPEND GGML_EXTRA_LIBS ws2_32)
|
||||
endif()
|
||||
|
||||
set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
|
||||
@@ -657,8 +667,8 @@ if (GGML_VULKAN)
|
||||
set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
list(APPEND GGML_EXTRA_LIBS Vulkan::Vulkan)
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
endif()
|
||||
@@ -817,8 +827,8 @@ if (GGML_KOMPUTE)
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
list(APPEND GGML_EXTRA_LIBS kompute)
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
@@ -883,9 +893,10 @@ if (GGML_CANN)
|
||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
|
||||
set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
|
||||
list(APPEND GGML_EXTRA_LIBS ${CANN_LIBRARIES} )
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS})
|
||||
list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64)
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
|
||||
endif()
|
||||
else()
|
||||
@@ -1322,12 +1333,14 @@ if (EMSCRIPTEN)
|
||||
set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||
endif()
|
||||
|
||||
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
|
||||
target_include_directories(ggml PUBLIC ../include)
|
||||
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
|
||||
target_include_directories(ggml PUBLIC ../include)
|
||||
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
|
||||
target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
|
||||
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
|
||||
target_compile_features (ggml PRIVATE c_std_11) # don't bump
|
||||
|
||||
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS)
|
||||
|
||||
target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
|
||||
|
||||
find_library(MATH_LIBRARY m)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cann/aclnn_ops.h"
|
||||
#include "ggml-cann/common.h"
|
||||
@@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||
return &ggml_backend_cann_buffer_types[device];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the name associated with a CANN host buffer type.
|
||||
*
|
||||
* This function returns the descriptive name associated with the specified
|
||||
* CANN host buffer type context.
|
||||
*
|
||||
* @param buft Pointer to the host buffer type context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the name associated with a CANN host buffer.
|
||||
*
|
||||
* This function returns the descriptive name associated with the specified
|
||||
* CANN host buffer context.
|
||||
*
|
||||
* @param buft Pointer to the host buffer context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free resources associated with a CANN host buffer.
|
||||
*
|
||||
* This function frees the resources associated with a CANN host buffer, including
|
||||
* its context.
|
||||
*
|
||||
* @param buffer The CANN host buffer to free.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
ACL_CHECK(aclrtFreeHost(buffer->context));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates a new CANN host buffer of the specified size.
|
||||
*
|
||||
* This function allocates a new CANN host buffer with the given size.
|
||||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
|
||||
*/
|
||||
static void * ggml_cann_host_malloc(size_t size) {
|
||||
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void * hostPtr = nullptr;
|
||||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||
if (err != ACL_SUCCESS) {
|
||||
|
||||
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||
return nullptr;
|
||||
}
|
||||
return hostPtr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates a new CANN host buffer of the specified type and size.
|
||||
*
|
||||
* @param buft Pointer to the host buffer type context.
|
||||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
||||
*/
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * hostPtr = ggml_cann_host_malloc(size);
|
||||
|
||||
if (hostPtr == nullptr) {
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Interface for managing CANN host buffer types in the GGML backend.
|
||||
*
|
||||
* Provides function pointers for allocating, querying properties, and managing
|
||||
* memory for CANN buffer types in the GGML backend.
|
||||
*/
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &ggml_backend_cann_buffer_type_host;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Computes the forward operation for a given tensor using CANN
|
||||
* operations.
|
||||
@@ -1942,7 +2053,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
||||
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_cann_set_device(ctx->device);
|
||||
ggml_backend_t cann_backend =
|
||||
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
||||
/* .interface = */ ggml_backend_cann_interface,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
||||
@@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
|
||||
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
||||
@@ -2742,6 +2742,7 @@ struct mmq_args {
|
||||
int64_t ne00; int64_t ne01; int64_t stride01;
|
||||
int64_t ne10; int64_t ne11; int64_t stride11;
|
||||
int64_t ne0;
|
||||
bool use_stream_k;
|
||||
};
|
||||
|
||||
template<ggml_type type>
|
||||
@@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, 1);
|
||||
|
||||
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
|
||||
if (!use_stream_k) {
|
||||
if (!args.use_stream_k) {
|
||||
if (args.ne01 % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
|
||||
|
||||
39
ggml/src/ggml-cuda/vendors/musa.h
vendored
39
ggml/src/ggml-cuda/vendors/musa.h
vendored
@@ -130,42 +130,3 @@
|
||||
#define cudaKernelNodeParams musaKernelNodeParams
|
||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
|
||||
// XXX: Clang builtins mapping
|
||||
#define __vsub4 __vsub4_musa
|
||||
#define __vcmpeq4 __vcmpeq4_musa
|
||||
#define __vcmpne4 __vcmpne4_musa
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
||||
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
||||
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
||||
@@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
||||
#define GGML_HASHSET_FULL ((size_t)-1)
|
||||
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
||||
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
||||
|
||||
@@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// computation graph
|
||||
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-kompute.h"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#import "ggml-metal.h"
|
||||
|
||||
#import "ggml-impl.h"
|
||||
#import "ggml-backend-impl.h"
|
||||
#import "ggml.h"
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
@@ -17,8 +17,8 @@
|
||||
#define GGML_METAL_LOG_WARN(...)
|
||||
#define GGML_METAL_LOG_ERROR(...)
|
||||
#else
|
||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
@@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// create multiple command buffers and enqueue them
|
||||
// then, we encode the graph into the command buffers in parallel
|
||||
|
||||
const int n_nodes = gf->n_nodes;
|
||||
const int n_nodes = gf->n_nodes;
|
||||
const int n_cb = ctx->n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
@@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
NSString * error_code = [command_buffer error].localizedDescription;
|
||||
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
|
||||
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "ggml-rpc.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <cinttypes>
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
@@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_CLAMP:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_IM2COL:
|
||||
// TODO: add support for the new F32 operations
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-vulkan-shaders.hpp"
|
||||
|
||||
120
ggml/src/ggml.c
120
ggml/src/ggml.c
@@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
#define GGML_N_TASKS_MAX (-1)
|
||||
|
||||
#define GGML_SOFT_MAX_UNROLL 4
|
||||
#define GGML_VEC_DOT_UNROLL 2
|
||||
@@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
#define GGML_F32x4_ADD vaddq_f32
|
||||
#define GGML_F32x4_MUL vmulq_f32
|
||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
@@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||
#define GGML_F16x8_ADD vaddq_f16
|
||||
#define GGML_F16x8_MUL vmulq_f16
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
||||
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x8
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||
@@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
||||
#endif
|
||||
|
||||
//
|
||||
// ggml object
|
||||
//
|
||||
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
//
|
||||
// ggml context
|
||||
//
|
||||
@@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||
}
|
||||
|
||||
int ggml_graph_size(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->size;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
|
||||
if (i < 0) {
|
||||
GGML_ASSERT(cgraph->n_nodes + i >= 0);
|
||||
return cgraph->nodes[cgraph->n_nodes + i];
|
||||
}
|
||||
|
||||
GGML_ASSERT(i < cgraph->n_nodes);
|
||||
return cgraph->nodes[i];
|
||||
}
|
||||
|
||||
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->nodes;
|
||||
}
|
||||
|
||||
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->n_nodes;
|
||||
}
|
||||
|
||||
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(cgraph->size > cgraph->n_nodes);
|
||||
cgraph->nodes[cgraph->n_nodes] = tensor;
|
||||
cgraph->n_nodes++;
|
||||
}
|
||||
|
||||
// Android's libc implementation "bionic" does not support setting affinity
|
||||
#if defined(__gnu_linux__)
|
||||
static void set_numa_thread_affinity(int thread_n) {
|
||||
@@ -23242,6 +23288,14 @@ int ggml_cpu_has_arm_fma(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_riscv_v(void) {
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_metal(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
|
||||
@@ -343,7 +343,7 @@ extern "C" {
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
//bool no_perf; // whether to measure performance timings, TODO: implement
|
||||
bool no_perf; // whether to measure performance timings
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@@ -1056,6 +1056,9 @@ extern "C" {
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
|
||||
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
|
||||
|
||||
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
|
||||
|
||||
// available samplers:
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
|
||||
@@ -1127,6 +1130,10 @@ extern "C" {
|
||||
int32_t n_logit_bias,
|
||||
const llama_logit_bias * logit_bias);
|
||||
|
||||
|
||||
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
||||
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
||||
|
||||
/// @details Sample and accept a token from the idx-th output of the last evaluation
|
||||
//
|
||||
// Shorthand for:
|
||||
@@ -1169,13 +1176,30 @@ extern "C" {
|
||||
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
|
||||
//
|
||||
|
||||
enum llama_perf_type {
|
||||
LLAMA_PERF_TYPE_CONTEXT = 0,
|
||||
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
|
||||
struct llama_perf_context_data {
|
||||
double t_start_ms;
|
||||
double t_load_ms;
|
||||
double t_p_eval_ms;
|
||||
double t_eval_ms;
|
||||
|
||||
int32_t n_p_eval;
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
|
||||
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
|
||||
struct llama_perf_sampler_data {
|
||||
double t_sample_ms;
|
||||
|
||||
int32_t n_sample;
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
|
||||
LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
|
||||
LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
|
||||
|
||||
// NOTE: the following work only with samplers constructed via llama_sampler_chain_init
|
||||
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <cfloat>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
@@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
||||
cur_p->size = k;
|
||||
}
|
||||
|
||||
static uint32_t get_rng_seed(uint32_t seed) {
|
||||
if (seed == LLAMA_DEFAULT_SEED) {
|
||||
// use system clock if std::random_device is not a true RNG
|
||||
static bool is_rd_prng = std::random_device().entropy() == 0;
|
||||
if (is_rd_prng) {
|
||||
return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
|
||||
}
|
||||
std::random_device rd;
|
||||
return rd();
|
||||
}
|
||||
return seed;
|
||||
}
|
||||
|
||||
// llama_sampler API
|
||||
|
||||
const char * llama_sampler_name(const struct llama_sampler * smpl) {
|
||||
@@ -335,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
|
||||
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
|
||||
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
||||
|
||||
if (i < 0 || i >= (int32_t) p->samplers.size()) {
|
||||
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return p->samplers[i];
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
|
||||
auto * p = (llama_sampler_chain *) chain->ctx;
|
||||
|
||||
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * result = p->samplers[i];
|
||||
p->samplers.erase(p->samplers.begin() + i);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int llama_sampler_chain_n(const struct llama_sampler * chain) {
|
||||
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
||||
|
||||
@@ -387,6 +414,7 @@ struct llama_sampler * llama_sampler_init_greedy() {
|
||||
|
||||
struct llama_sampler_dist {
|
||||
const uint32_t seed;
|
||||
uint32_t seed_cur;
|
||||
|
||||
std::mt19937 rng;
|
||||
};
|
||||
@@ -416,7 +444,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
|
||||
|
||||
static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_dist *) smpl->ctx;
|
||||
ctx->rng = std::mt19937(ctx->seed);
|
||||
ctx->seed_cur = get_rng_seed(ctx->seed);
|
||||
ctx->rng.seed(ctx->seed_cur);
|
||||
}
|
||||
|
||||
static void llama_sampler_dist_free(struct llama_sampler * smpl) {
|
||||
@@ -433,11 +462,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_dist_i,
|
||||
/* .ctx = */ new llama_sampler_dist {
|
||||
/* .seed = */ seed,
|
||||
/* .rng = */ std::mt19937(seed),
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -1032,6 +1063,7 @@ struct llama_sampler_mirostat {
|
||||
const int32_t n_vocab;
|
||||
|
||||
const uint32_t seed;
|
||||
uint32_t seed_cur;
|
||||
|
||||
const float tau;
|
||||
const float eta;
|
||||
@@ -1100,7 +1132,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
|
||||
static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
|
||||
ctx->mu = 2.0f*ctx->tau;
|
||||
ctx->rng = std::mt19937(ctx->seed);
|
||||
ctx->seed_cur = get_rng_seed(ctx->seed);
|
||||
ctx->rng.seed(ctx->seed_cur);
|
||||
}
|
||||
|
||||
static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
|
||||
@@ -1117,16 +1150,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_mirostat_i,
|
||||
/* .ctx = */ new llama_sampler_mirostat {
|
||||
/* .n_vocab = */ n_vocab,
|
||||
/* .seed = */ seed,
|
||||
/* .tau = */ tau,
|
||||
/* .eta = */ eta,
|
||||
/* .m = */ m,
|
||||
/* .mu = */ 2.0f*tau,
|
||||
/* .rng = */ std::mt19937(seed),
|
||||
/* .n_vocab = */ n_vocab,
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .tau = */ tau,
|
||||
/* .eta = */ eta,
|
||||
/* .m = */ m,
|
||||
/* .mu = */ 2.0f*tau,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -1135,6 +1170,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
|
||||
|
||||
struct llama_sampler_mirostat_v2 {
|
||||
const uint32_t seed;
|
||||
uint32_t seed_cur;
|
||||
|
||||
const float tau;
|
||||
const float eta;
|
||||
@@ -1179,7 +1215,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
|
||||
static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
|
||||
ctx->mu = 2.0f*ctx->tau;
|
||||
ctx->rng = std::mt19937(ctx->seed);
|
||||
ctx->seed_cur = get_rng_seed(ctx->seed);
|
||||
ctx->rng.seed(ctx->seed_cur);
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
|
||||
@@ -1212,14 +1249,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_mirostat_v2_i,
|
||||
/* .ctx = */ new llama_sampler_mirostat_v2 {
|
||||
/* .seed = */ seed,
|
||||
/* .tau = */ tau,
|
||||
/* .eta = */ eta,
|
||||
/* .mu = */ 2.0f*tau,
|
||||
/* .rng = */ std::mt19937(seed),
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .tau = */ tau,
|
||||
/* .eta = */ eta,
|
||||
/* .mu = */ 2.0f*tau,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -1505,6 +1544,8 @@ struct llama_sampler * llama_sampler_init_penalties(
|
||||
ignore_eos = false;
|
||||
}
|
||||
|
||||
penalty_last_n = std::max(penalty_last_n, 0);
|
||||
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_penalties_i,
|
||||
/* .ctx = */ new llama_sampler_penalties {
|
||||
@@ -1568,6 +1609,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
|
||||
return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
|
||||
@@ -1599,3 +1641,65 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// utils
|
||||
|
||||
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
||||
if (smpl->iface == &llama_sampler_dist_i) {
|
||||
return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
|
||||
}
|
||||
|
||||
if (smpl->iface == &llama_sampler_mirostat_i) {
|
||||
return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
|
||||
}
|
||||
|
||||
if (smpl->iface == &llama_sampler_mirostat_v2_i) {
|
||||
return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
|
||||
}
|
||||
|
||||
if (smpl->iface == &llama_sampler_chain_i) {
|
||||
const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
|
||||
for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
|
||||
const uint32_t seed = llama_sampler_get_seed(*it);
|
||||
if (seed != LLAMA_DEFAULT_SEED) {
|
||||
return seed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return LLAMA_DEFAULT_SEED;
|
||||
}
|
||||
|
||||
// perf
|
||||
|
||||
struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
|
||||
struct llama_perf_sampler_data data = {};
|
||||
|
||||
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
|
||||
GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
|
||||
}
|
||||
|
||||
const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
|
||||
|
||||
data.t_sample_ms = 1e-3 * ctx->t_sample_us;
|
||||
data.n_sample = std::max(0, ctx->n_sample);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
void llama_perf_sampler_print(const struct llama_sampler * chain) {
|
||||
const auto data = llama_perf_sampler(chain);
|
||||
|
||||
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
|
||||
}
|
||||
|
||||
void llama_perf_sampler_reset(struct llama_sampler * chain) {
|
||||
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
|
||||
GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
|
||||
}
|
||||
|
||||
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
|
||||
|
||||
ctx->t_sample_us = ctx->n_sample = 0;
|
||||
}
|
||||
|
||||
168
src/llama.cpp
168
src/llama.cpp
@@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_sycl_host_buffer_type();
|
||||
}
|
||||
#elif defined(GGML_USE_CANN)
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_cann_host_buffer_type();
|
||||
}
|
||||
#elif defined(GGML_USE_CPU_HBM)
|
||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
@@ -2482,6 +2486,7 @@ struct llama_cparams {
|
||||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
@@ -6657,8 +6662,6 @@ static bool llm_load_tensors(
|
||||
bool use_mlock,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
model.t_start_us = ggml_time_us();
|
||||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
model.split_mode = split_mode;
|
||||
@@ -8589,14 +8592,13 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = ggml_time_us() - model.t_start_us;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
model.t_start_us = ggml_time_us();
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
|
||||
|
||||
@@ -8658,6 +8660,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
return -1;
|
||||
}
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = ggml_time_us() - model.t_start_us;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -9258,7 +9264,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
|
||||
// FIXME: zero-out NANs?
|
||||
states = ggml_mul(ctx, states, state_mask);
|
||||
|
||||
// copy states which won't be changed further (between n_seqs and n_rs)
|
||||
// copy states which won't be changed further (between n_seqs and n_kv)
|
||||
ggml_build_forward_expand(graph,
|
||||
ggml_cpy(ctx,
|
||||
ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
|
||||
@@ -9877,8 +9883,8 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
|
||||
// find result_norm tensor for input
|
||||
struct ggml_tensor * inp = nullptr;
|
||||
for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||
inp = gf->nodes[i];
|
||||
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
|
||||
inp = ggml_graph_node(gf, i);
|
||||
if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
|
||||
break;
|
||||
} else {
|
||||
@@ -15820,7 +15826,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
||||
|
||||
// clear unused states
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
uint32_t cell_id = i + kv_self.head;
|
||||
const uint32_t cell_id = i + kv_self.head;
|
||||
llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
|
||||
|
||||
data[i] = (float) (kv_cell.src >= 0);
|
||||
@@ -16076,19 +16082,21 @@ static int llama_decode_internal(
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & cparams = lctx.cparams;
|
||||
|
||||
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
|
||||
|
||||
if (batch_all.token) {
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_tokens_all <= cparams.n_batch);
|
||||
|
||||
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
|
||||
@@ -16205,8 +16213,8 @@ static int llama_decode_internal(
|
||||
ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
|
||||
|
||||
// the output is always the last tensor in the graph
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
||||
struct ggml_tensor * res = ggml_graph_node(gf, -1);
|
||||
struct ggml_tensor * embd = ggml_graph_node(gf, -2);
|
||||
|
||||
if (lctx.n_outputs == 0) {
|
||||
// no output
|
||||
@@ -16215,9 +16223,9 @@ static int llama_decode_internal(
|
||||
} else if (cparams.embeddings) {
|
||||
res = nullptr; // do not extract logits for embedding case
|
||||
embd = nullptr;
|
||||
for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||
if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
|
||||
embd = gf->nodes[i];
|
||||
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
|
||||
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
|
||||
embd = ggml_graph_node(gf, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -16375,19 +16383,21 @@ static int llama_encode_internal(
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & cparams = lctx.cparams;
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
if (batch.token) {
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
|
||||
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
|
||||
|
||||
@@ -16432,15 +16442,15 @@ static int llama_encode_internal(
|
||||
// there are two cases here
|
||||
if (llama_model_has_decoder(&lctx.model)) {
|
||||
// first case is an encoder-decoder T5 model where embeddings are passed to decoder
|
||||
embd = gf->nodes[gf->n_nodes - 1];
|
||||
embd = ggml_graph_node(gf, -1);
|
||||
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
|
||||
} else {
|
||||
// second case is an encoder-only T5 model
|
||||
if (cparams.embeddings) {
|
||||
// only output embeddings if required
|
||||
embd = gf->nodes[gf->n_nodes - 1];
|
||||
embd = ggml_graph_node(gf, -1);
|
||||
if (strcmp(embd->name, "result_embd_pooled") != 0) {
|
||||
embd = gf->nodes[gf->n_nodes - 2];
|
||||
embd = ggml_graph_node(gf, -2);
|
||||
}
|
||||
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
|
||||
}
|
||||
@@ -17530,6 +17540,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
||||
|
||||
// do not quantize relative position bias (T5)
|
||||
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||
@@ -17939,6 +17951,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.embeddings =*/ false,
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
};
|
||||
@@ -18149,6 +18162,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.embeddings = params.embeddings;
|
||||
cparams.offload_kqv = params.offload_kqv;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
@@ -18486,7 +18500,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
|
||||
}
|
||||
}
|
||||
@@ -20067,10 +20081,14 @@ void llama_synchronize(struct llama_context * ctx) {
|
||||
|
||||
// add the evaluation to the stats
|
||||
if (ctx->n_queued_tokens == 1) {
|
||||
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
if (!ctx->cparams.no_perf) {
|
||||
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
}
|
||||
ctx->n_eval++;
|
||||
} else if (ctx->n_queued_tokens > 1) {
|
||||
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
if (!ctx->cparams.no_perf) {
|
||||
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
}
|
||||
ctx->n_p_eval += ctx->n_queued_tokens;
|
||||
}
|
||||
|
||||
@@ -20666,6 +20684,7 @@ const char * llama_print_system_info(void) {
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
@@ -20677,65 +20696,40 @@ const char * llama_print_system_info(void) {
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
void llama_perf_print(const void * ctx, enum llama_perf_type type) {
|
||||
switch (type) {
|
||||
case LLAMA_PERF_TYPE_CONTEXT:
|
||||
{
|
||||
const auto * p = (const struct llama_context *) ctx;
|
||||
struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
|
||||
struct llama_perf_context_data data = {};
|
||||
|
||||
const double t_start_ms = 1e-3 * p->t_start_us;
|
||||
const double t_end_ms = 1.00 * ggml_time_ms();
|
||||
const double t_load_ms = 1e-3 * p->t_load_us;
|
||||
const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
|
||||
const double t_eval_ms = 1e-3 * p->t_eval_us;
|
||||
|
||||
const int32_t n_p_eval = std::max(0, p->n_p_eval);
|
||||
const int32_t n_eval = std::max(1, p->n_eval);
|
||||
|
||||
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
|
||||
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
|
||||
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
|
||||
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
|
||||
} break;
|
||||
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
|
||||
{
|
||||
const auto * smpl = (const struct llama_sampler *) ctx;
|
||||
const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
|
||||
|
||||
const double t_sampler_ms = 1e-3 * p->t_sample_us;
|
||||
|
||||
const int32_t n_sampler = std::max(0, p->n_sample);
|
||||
|
||||
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("invalid perf type");
|
||||
if (ctx == nullptr) {
|
||||
return data;
|
||||
}
|
||||
|
||||
data.t_start_ms = 1e-3 * ctx->t_start_us;
|
||||
data.t_load_ms = 1e-3 * ctx->t_load_us;
|
||||
data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
|
||||
data.t_eval_ms = 1e-3 * ctx->t_eval_us;
|
||||
data.n_p_eval = std::max(1, ctx->n_p_eval);
|
||||
data.n_eval = std::max(1, ctx->n_eval);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
void llama_perf_reset(void * ctx, enum llama_perf_type type) {
|
||||
switch (type) {
|
||||
case LLAMA_PERF_TYPE_CONTEXT:
|
||||
{
|
||||
auto * p = (struct llama_context *) ctx;
|
||||
void llama_perf_context_print(const struct llama_context * ctx) {
|
||||
const auto data = llama_perf_context(ctx);
|
||||
|
||||
p->t_start_us = ggml_time_us();
|
||||
p->t_eval_us = p->n_eval = 0;
|
||||
p->t_p_eval_us = p->n_p_eval = 0;
|
||||
} break;
|
||||
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
|
||||
{
|
||||
auto * smpl = (struct llama_sampler *) ctx;
|
||||
auto * p = (struct llama_sampler_chain *) smpl->ctx;
|
||||
const double t_end_ms = 1e-3 * ggml_time_us();
|
||||
|
||||
p->t_sample_us = p->n_sample = 0;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("invalid perf type");
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
|
||||
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
|
||||
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
||||
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
||||
}
|
||||
|
||||
void llama_perf_context_reset(struct llama_context * ctx) {
|
||||
ctx->t_start_us = ggml_time_us();
|
||||
ctx->t_eval_us = ctx->n_eval = 0;
|
||||
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
||||
}
|
||||
|
||||
void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
@@ -6,18 +9,16 @@
|
||||
#undef NDEBUG
|
||||
#include <cassert>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int main(void) {
|
||||
gpt_params params;
|
||||
|
||||
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
|
||||
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
||||
try {
|
||||
auto options = gpt_params_parser_init(params, (enum llama_example)ex);
|
||||
auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
|
||||
std::unordered_set<std::string> seen_args;
|
||||
std::unordered_set<std::string> seen_env_vars;
|
||||
for (const auto & opt : options) {
|
||||
for (const auto & opt : ctx_arg.options) {
|
||||
// check for args duplications
|
||||
for (const auto & arg : opt.args) {
|
||||
if (seen_args.find(arg) == seen_args.end()) {
|
||||
@@ -52,40 +53,51 @@ int main(void) {
|
||||
};
|
||||
|
||||
std::vector<std::string> argv;
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
|
||||
printf("test-arg-parser: test invalid usage\n\n");
|
||||
|
||||
// missing value
|
||||
argv = {"binary_name", "-m"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
// wrong value (int)
|
||||
argv = {"binary_name", "-ngl", "hello"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
// wrong value (enum)
|
||||
argv = {"binary_name", "-sm", "hello"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
||||
|
||||
|
||||
printf("test-arg-parser: test valid usage\n\n");
|
||||
|
||||
argv = {"binary_name", "-m", "model_file.gguf"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "model_file.gguf");
|
||||
|
||||
argv = {"binary_name", "-t", "1234"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.cpuparams.n_threads == 1234);
|
||||
|
||||
argv = {"binary_name", "--verbose"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.verbosity == 1);
|
||||
|
||||
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "abc.gguf");
|
||||
assert(params.n_predict == 6789);
|
||||
assert(params.n_batch == 9090);
|
||||
|
||||
// --draft cannot be used outside llama-speculative
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||
assert(params.n_draft == 123);
|
||||
|
||||
// skip this part on windows, because setenv is not supported
|
||||
#ifdef _WIN32
|
||||
printf("test-arg-parser: skip on windows build\n");
|
||||
@@ -94,12 +106,12 @@ int main(void) {
|
||||
|
||||
setenv("LLAMA_ARG_THREADS", "blah", true);
|
||||
argv = {"binary_name"};
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
|
||||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||
setenv("LLAMA_ARG_THREADS", "1010", true);
|
||||
argv = {"binary_name"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "blah.gguf");
|
||||
assert(params.cpuparams.n_threads == 1010);
|
||||
|
||||
@@ -109,7 +121,7 @@ int main(void) {
|
||||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||
setenv("LLAMA_ARG_THREADS", "1010", true);
|
||||
argv = {"binary_name", "-m", "overwritten.gguf"};
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||
assert(params.model == "overwritten.gguf");
|
||||
assert(params.cpuparams.n_threads == 1010);
|
||||
#endif // _WIN32
|
||||
|
||||
@@ -519,7 +519,7 @@ struct test_case {
|
||||
|
||||
// add sentinels as graph nodes so that they are checked in the callback
|
||||
for (ggml_tensor * sentinel : sentinels) {
|
||||
gf->nodes[gf->n_nodes++] = sentinel;
|
||||
ggml_graph_add_node(gf, sentinel);
|
||||
}
|
||||
|
||||
// randomize tensors
|
||||
@@ -679,9 +679,9 @@ struct test_case {
|
||||
|
||||
// duplicate the op
|
||||
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
||||
int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
|
||||
int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
||||
for (int i = 1; i < n_runs; i++) {
|
||||
gf->nodes[gf->n_nodes++] = out;
|
||||
ggml_graph_add_node(gf, out);
|
||||
}
|
||||
|
||||
// calculate memory
|
||||
@@ -696,11 +696,11 @@ struct test_case {
|
||||
}
|
||||
return size;
|
||||
};
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
||||
if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
|
||||
continue;
|
||||
}
|
||||
mem += tensor_op_size(gf->nodes[i]);
|
||||
mem += tensor_op_size(ggml_graph_node(gf, i));
|
||||
}
|
||||
|
||||
// run
|
||||
@@ -804,7 +804,7 @@ struct test_case {
|
||||
ggml_graph_cpy(gf, gb);
|
||||
ggml_build_backward_expand(ctx, gf, gb, false);
|
||||
if (expect.size() != 1 || expect[0] != 0.0f) {
|
||||
GGML_ASSERT(gb->n_nodes > gf->n_nodes);
|
||||
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user