Compare commits

...

17 Commits

Author SHA1 Message Date
Erik Scholz
698efad5fb CI: make the brew update temporarily optional. (#2092)
until they decide to fix the brew installation in the macos runners.
see the open issues. eg https://github.com/actions/runner-images/pull/7710
2023-07-04 01:50:12 +02:00
Govlzkoy
14a2cc71f6 [ggml] fix index for ne03 value in ggml_cl_mul_f32 (#2088) 2023-07-04 07:50:00 +08:00
Henri Vasserman
1cf14ccef1 fix server crashes (#2076) 2023-07-04 00:05:23 +03:00
Howard Su
cc45a7feb8 Fix crash of test-tokenizer-0 under Debug build (#2064)
* Fix crash of test-tokenizer-0 under Debug build

* Change per comment
2023-07-03 20:43:55 +02:00
Howard Su
55dbb915cc [llama] No need to check file version when loading vocab score (#2079) 2023-07-03 19:58:58 +08:00
WangHaoranRobin
d7d2e6a0f0 server: add option to output probabilities for completion (#1962)
* server: add option to output probabilities for completion
* server: fix issue when handling probability output for incomplete tokens for multibyte character generation
* server: fix llama_sample_top_k order
* examples/common.h: put all bool variables in gpt_params together
2023-07-03 00:38:44 +03:00
Georgi Gerganov
46088f7231 ggml : fix build with OpenBLAS (close #2066) 2023-07-02 09:46:46 +03:00
Johannes Gäßler
0bc2cdfc87 Better CUDA synchronization logic (#2057) 2023-07-01 21:49:44 +02:00
Johannes Gäßler
befb3a3562 Test-based VRAM scratch size + context adjustment (#2056) 2023-07-01 21:47:26 +02:00
Daniel Drake
b213227067 cmake : don't force -mcpu=native on aarch64 (#2063)
It's currently not possible to cross-compile llama.cpp for aarch64
because CMakeLists.txt forces -mcpu=native for that target.

-mcpu=native doesn't make sense if your build host is not the
target architecture, and clang rejects it for that reason, aborting the
build. This can be easily reproduced using the current Android NDK to build
for aarch64 on an x86_64 host.

If there is not a specific CPU-tuning target for aarch64 then -mcpu
should be omitted completely. I think that makes sense, there is not
enough variance in the aarch64 instruction set to warrant a fixed -mcpu
optimization at this point. And if someone is building natively and wishes
to enable any possible optimizations for the host device, then there is
already the LLAMA_NATIVE option available.

Fixes #495.
2023-07-01 21:31:44 +03:00
Aaron Miller
2f8cd979ec metal : release buffers when freeing metal context (#2062) 2023-07-01 21:14:59 +03:00
Judd
471aab6e4c convert : add support of baichuan-7b (#2055)
Co-authored-by: Judd <foldl@boxvest.com>
2023-07-01 20:00:25 +03:00
Georgi Gerganov
463f2f4c4f llama : fix return value of llama_load_session_file_internal (#2022) 2023-07-01 19:05:09 +03:00
Rand Xie
cb44dbc7de llama : catch llama_load_session_file_internal exceptions (#2022)
* convert checks in llama_load_session_file to throw and handle them

* make llama_load_session_file_internal static

* address feedbacks to avoid using exceptions
2023-07-01 19:02:58 +03:00
Georgi Gerganov
79f634a19d embd-input : fix returning ptr to temporary 2023-07-01 18:46:00 +03:00
Georgi Gerganov
04606a1599 train : fix compile warning 2023-07-01 18:45:44 +03:00
Qingyou Meng
b1ca8f36a9 ggml : disable GGML_TASK_INIT and GGML_TASK_FINALIZE by default (#1995)
Will not be scheduled unless explicitly enabled.
2023-07-01 18:42:43 +03:00
16 changed files with 332 additions and 91 deletions

View File

@@ -111,6 +111,7 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
@@ -129,6 +130,7 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update

View File

@@ -386,11 +386,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
if (MSVC)
# TODO: arm msvc?
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
add_compile_options(-mcpu=native)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)

View File

@@ -85,6 +85,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
**Bindings:**

View File

@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
return 1
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@dataclass
class Params:
@@ -321,6 +321,10 @@ class Tensor(metaclass=ABCMeta):
@abstractmethod
def permute(self, n_head: int) -> 'Tensor': ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
@abstractmethod
def part(self, n_part: int) -> 'UnquantizedTensor': ...
@abstractmethod
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
@@ -345,6 +349,14 @@ class UnquantizedTensor(Tensor):
def to_ggml(self) -> 'UnquantizedTensor':
return self
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
def part(self, n_part: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
def permute(self, n_head: int) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head))
@@ -642,6 +654,19 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
return lazy_tensor.load().permute(n_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head)
s = lazy_tensor.shape.copy()
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().part(n_part)
s = lazy_tensor.shape.copy()
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out: LazyModel = {}
@@ -650,11 +675,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out["output.weight"] = model["lm_head.weight"]
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
else:
break
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]

View File

@@ -31,7 +31,7 @@ struct gpt_params {
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,6 +59,7 @@ struct gpt_params {
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs

View File

@@ -210,9 +210,12 @@ llama_token sampling_id(struct MyModel* mymodel) {
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
std::string ret;
if (id == llama_token_eos()) ret = "</s>";
else ret = llama_token_to_str(ctx, id);
static std::string ret;
if (id == llama_token_eos()) {
ret = "</s>";
} else {
ret = llama_token_to_str(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}

View File

@@ -5,7 +5,6 @@
#include "llama.h"
#include "build-info.h"
extern "C" {
typedef struct MyModel {
@@ -14,14 +13,13 @@ typedef struct MyModel {
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char* sampling(struct MyModel* mymodel);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);

View File

@@ -26,6 +26,17 @@ struct server_params {
int32_t write_timeout = 600;
};
// completion token output with probabilities
struct completion_token_output {
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
};
static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
@@ -86,6 +97,40 @@ static void server_log(const char * level, const char * function, int line,
fflush(stdout);
}
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
// if first bit is 1, meaning it's a partial character
if (out.size() > 0 && (out[0] & 0x80) == 0x80) {
std::stringstream ss;
ss<< std::hex << (out[0] & 0xff);
std::string res ( ss.str() );
out = "byte: \\x" + res;
}
return out;
}
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> probs) {
json out = json::array();
for (const auto & prob : probs) {
json probs_for_token = json::array();
for (const auto & p : prob.probs) {
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json {
{ "tok_str", tok_str },
{ "prob", p.prob },
});
}
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json {
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}
static bool server_verbose = false;
#if SERVER_VERBOSE != 1
@@ -107,6 +152,7 @@ struct llama_server_context {
bool stream = false;
bool has_next_token = false;
std::string generated_text;
std::vector<completion_token_output> generated_token_probs;
size_t num_tokens_predicted = 0;
size_t n_past = 0;
@@ -142,6 +188,7 @@ struct llama_server_context {
num_tokens_predicted = 0;
generated_text = "";
generated_text.reserve(params.n_ctx);
generated_token_probs.clear();
truncated = false;
stopped_eos = false;
stopped_word = false;
@@ -221,8 +268,9 @@ struct llama_server_context {
llama_set_rng_seed(ctx, params.seed);
}
llama_token nextToken() {
llama_token result = -1;
completion_token_output nextToken() {
completion_token_output result;
result.tok = -1;
if (embd.size() >= (size_t)params.n_ctx) {
// Reset context
@@ -261,7 +309,8 @@ struct llama_server_context {
if (params.n_predict == 0) {
has_next_token = false;
return llama_token_eos();
result.tok = llama_token_eos();
return result;
}
// out of user input, sample next token
@@ -278,7 +327,7 @@ struct llama_server_context {
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
const int32_t n_probs = params.n_probs;
{
auto * logits = llama_get_logits(ctx);
@@ -312,35 +361,42 @@ struct llama_server_context {
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
result.tok = llama_sample_token_greedy(ctx, &candidates_p);
if (n_probs > 0) {
llama_sample_softmax(ctx, &candidates_p);
}
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
size_t min_keep = std::max(1, n_probs);
llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
result.tok = llama_sample_token(ctx, &candidates_p);
}
}
for (size_t i = 0; i < std::min(candidates_p.size, (size_t) n_probs); ++i) {
result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
last_n_tokens.push_back(result.tok);
num_tokens_predicted++;
}
// add it to the context
embd.push_back(id);
result = id;
embd.push_back(result.tok);
// decrement remaining sampling budget
--n_remain;
@@ -382,12 +438,16 @@ struct llama_server_context {
return stop_pos;
}
std::string doCompletion() {
const llama_token token = nextToken();
completion_token_output doCompletion() {
const completion_token_output token_with_probs = nextToken();
const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token);
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
generated_text += token_text;
if (params.n_probs > 0) {
generated_token_probs.push_back(token_with_probs);
}
if (multibyte_pending > 0) {
multibyte_pending -= token_text.size();
} else if (token_text.size() == 1) {
@@ -416,8 +476,8 @@ struct llama_server_context {
}
LOG_VERBOSE("next token", {
{ "token", token },
{ "token_text", llama_token_to_str(ctx, token) },
{ "token", token_with_probs.tok },
{ "token_text", tokens_to_output_formatted_string(ctx, token_with_probs.tok) },
{ "has_next_token", has_next_token },
{ "n_remain", n_remain },
{ "num_tokens_predicted", num_tokens_predicted },
@@ -427,7 +487,7 @@ struct llama_server_context {
{ "stopping_word", stopping_word },
});
return token_text;
return token_with_probs;
}
std::vector<float> getEmbedding() {
@@ -669,6 +729,7 @@ static json format_generation_settings(llama_server_context & llama) {
{ "ignore_eos", ignore_eos },
{ "stream", llama.stream },
{ "logit_bias", llama.params.logit_bias },
{ "n_probs", llama.params.n_probs },
};
}
@@ -678,8 +739,9 @@ static json format_embedding_response(llama_server_context & llama) {
};
}
static json format_final_response(llama_server_context & llama, const std::string & content) {
return json {
static json format_final_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
json res = json {
{ "content", content },
{ "stop", true },
{ "model", llama.params.model_alias },
@@ -692,13 +754,25 @@ static json format_final_response(llama_server_context & llama, const std::strin
{ "stopped_limit", llama.stopped_limit },
{ "stopping_word", llama.stopping_word },
};
if (llama.params.n_probs > 0) {
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
}
return res;
}
static json format_partial_response(const std::string & content) {
return json {
static json format_partial_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
json res = json {
{ "content", content },
{ "stop", false },
};
if (llama.params.n_probs > 0) {
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
}
return res;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
@@ -728,6 +802,7 @@ static void parse_options_completion(const json & body, llama_server_context & l
llama.params.n_keep = body.value("n_keep", default_params.n_keep);
llama.params.seed = body.value("seed", default_params.seed);
llama.params.prompt = body.value("prompt", default_params.prompt);
llama.params.n_probs = body.value("n_probs", default_params.n_probs);
llama.params.logit_bias.clear();
if (body.value("ignore_eos", false)) {
@@ -830,7 +905,8 @@ int main(int argc, char ** argv) {
size_t stop_pos = std::string::npos;
while (llama.has_next_token) {
const std::string token_text = llama.doCompletion();
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
@@ -844,7 +920,7 @@ int main(int argc, char ** argv) {
llama.generated_text.end());
}
const json data = format_final_response(llama, llama.generated_text);
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
llama_print_timings(llama.ctx);
@@ -853,9 +929,11 @@ int main(int argc, char ** argv) {
} else {
const auto chunked_content_provider = [&](size_t, DataSink & sink) {
size_t sent_count = 0;
size_t sent_token_probs_index = 0;
while (llama.has_next_token) {
const std::string token_text = llama.doCompletion();
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
if (llama.multibyte_pending > 0) {
continue;
}
@@ -878,10 +956,22 @@ int main(int argc, char ** argv) {
const std::string to_send = llama.generated_text.substr(pos, stop_pos);
sent_count += to_send.size();
std::vector<completion_token_output> probs_output = {};
if (llama.params.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
if (probs_pos < probs_stop_pos) {
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
}
sent_token_probs_index = probs_stop_pos;
}
const json data = llama.has_next_token
? format_partial_response(to_send)
? format_partial_response(llama, to_send, probs_output)
// Generation is done, send extra information.
: format_final_response(llama, to_send);
: format_final_response(llama, to_send, llama.generated_token_probs);
const std::string str =
"data: " +

View File

@@ -2671,7 +2671,8 @@ struct train_params {
const char * fn_checkpoint_out;
const char * fn_model_out;
int seed;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;

View File

@@ -214,6 +214,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -1970,7 +1975,6 @@ inline void ggml_cuda_op_add(
} else {
GGML_ASSERT(false);
}
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2002,7 +2006,6 @@ inline void ggml_cuda_op_mul(
// compute
mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
}
(void) dst;
@@ -2023,7 +2026,6 @@ inline void ggml_cuda_op_silu(
// compute
silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2046,7 +2048,6 @@ inline void ggml_cuda_op_rms_norm(
// compute
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2125,7 +2126,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
GGML_ASSERT(false);
break;
}
CUDA_CHECK(cudaGetLastError());
#ifdef GGML_CUDA_DMMV_F16
if (src1_convert_f16) {
@@ -2202,7 +2202,6 @@ inline void ggml_cuda_op_rope(
// compute
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) dst;
(void) src0_ddq_i;
@@ -2226,7 +2225,6 @@ inline void ggml_cuda_op_diag_mask_inf(
// compute
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) dst;
(void) src0_ddq_i;
@@ -2248,7 +2246,6 @@ inline void ggml_cuda_op_soft_max(
// compute
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2344,10 +2341,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
// if multiple GPUs are used they need to wait for the main GPU to finish
// if multiple devices are used they need to wait for the main device
// here an event is recorded that signifies that the main device has finished calculating the input data
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
}
for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2371,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
int64_t row_diff = row_high - row_low;
cudaSetDevice(id);
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
// wait for main GPU data if necessary
if (split && id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
}
if (src0_on_device && src0_is_contiguous) {
if (src0_is_f32) {
@@ -2448,8 +2452,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
}
const int64_t i11 = i13*ne12 + i12;
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
// for split tensors the data begins at i0 == i0_offset_low
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2511,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
// do the computation
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
// copy dst to host or other device if necessary
if (!dst_on_device) {
@@ -2538,6 +2541,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
}
}
// signify to main device that other device is done
if (split && g_device_count > 1 && id != g_main_device) {
CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
}
}
}
}
@@ -2549,7 +2557,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
}
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaDeviceSynchronize());
if (src0_asq[id] > 0) {
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2571,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
}
}
// main device waits for all other devices to be finished
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
for (int id = 0; id < g_device_count; ++id) {
if (id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
}
}
}
if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
}
}
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2803,25 +2825,32 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
extra->data_device[id] = buf;
if (backend == GGML_BACKEND_GPU_SPLIT) {
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
}
}
tensor->extra = extra;
}
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
}
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
for (int id = 0; id < g_device_count; ++id) {
if (extra->data_device[id] == nullptr) {
continue;
if (extra->data_device[id] != nullptr) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
}
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
if (extra->events[id] != nullptr) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaEventDestroy(extra->events[id]));
}
}
delete extra;

View File

@@ -8,10 +8,6 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
};
void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);

View File

@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
void ggml_metal_free(struct ggml_metal_context * ctx) {
fprintf(stderr, "%s: deallocating\n", __func__);
for (int i = 0; i < ctx->n_buffers; ++i) {
[ctx->buffers[i].metal release];
}
free(ctx);
}

View File

@@ -1376,7 +1376,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

62
ggml.c
View File

@@ -3846,6 +3846,41 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
// WARN:
// Mis-confguration can lead to problem that's hard to reason about:
// * At best it crash or talks nosense.
// * At worst it talks slightly difference but hard to perceive.
//
// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
// Take care about compile options (e.g., GGML_USE_xxx).
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_1D_S1_PH ] = true;
p[GGML_OP_CONV_1D_S2_PH ] = true;
p[GGML_OP_CONV_2D_SK_P0 ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}
//
// ggml context
//
@@ -4267,6 +4302,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_cl_init();
#endif
ggml_setup_op_has_task_pass();
is_first_call = false;
}
@@ -16791,9 +16828,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
}
// distribute new work or execute it direct if 1T
@@ -16805,10 +16844,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
params.nth = node->n_tasks;
/* INIT */
params.type = GGML_TASK_INIT;
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@@ -16816,9 +16858,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
} else {
break;
}

3
ggml.h
View File

@@ -444,6 +444,9 @@ extern "C" {
// compute types
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
GGML_TASK_INIT = 0,
GGML_TASK_COMPUTE,

View File

@@ -66,6 +66,7 @@ enum e_model {
MODEL_65B,
};
static const size_t kB = 1024;
static const size_t MB = 1024*1024;
// computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
return k_sizes;
}
// amount of VRAM needed per batch size to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 512ull * kB },
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
{ MODEL_65B, 1536ull * kB },
};
return k_sizes;
}
// amount of VRAM needed per batch size and context to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 128ull },
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
{ MODEL_65B, 416ull },
};
return k_sizes;
}
// default hparams (LLaMA 7B)
struct llama_hparams {
uint32_t n_vocab = 32000;
@@ -165,8 +194,8 @@ struct llama_layer {
};
struct llama_kv_cache {
struct ggml_tensor * k;
struct ggml_tensor * v;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
@@ -253,7 +282,13 @@ struct llama_model {
struct llama_context {
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
~llama_context() {
if (ctx_metal) {
ggml_metal_free(ctx_metal);
}
}
#endif
std::mt19937 rng;
bool has_evaluated_once = false;
@@ -446,9 +481,7 @@ struct llama_file_loader {
std::string word = file.read_string(len);
float score = 0.0f;
if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
file.read_raw(&score, sizeof(score));
}
file.read_raw(&score, sizeof(score));
vocab.token_to_id[word] = i;
@@ -1112,11 +1145,14 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
ggml_cuda_set_scratch_size(0); // disable scratch
} else {
vram_scratch = n_batch * MB;
const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
ggml_cuda_set_scratch_size(vram_scratch);
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch / MB);
fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch_base / kB, vram_scratch_per_context,
(vram_scratch + MB - 1) / MB); // round up
}
}
#endif // GGML_USE_CUBLAS
@@ -3219,7 +3255,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
return nread;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(path_session, "rb");
// sanity checks
@@ -3273,6 +3309,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
return true;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
try {
return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
} catch (const std::exception & err) {
fprintf(stderr, "error loading session file: %s\n", err.what());
return false;
}
}
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
llama_file file(path_session, "wb");