Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2026-03-17 16:44:07 +00:00)
llama : update LoRA API. + fix excessive graph reserves (#19280)
* Refactoring to use new llama_put_adapter_loras

* cont : alternative lora API

---------

Co-authored-by: Jake Chavis <jakechavis6@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
@@ -1223,7 +1223,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
             return res;
         }

-        int err = llama_apply_adapter_cvec(
+        int err = llama_set_adapter_cvec(
                 lctx,
                 cvec.data.data(),
                 cvec.data.size(),
@@ -1325,12 +1325,15 @@ std::string get_model_endpoint() {
 }

 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
-    llama_clear_adapter_lora(ctx);
-    for (auto & la : lora) {
-        if (la.scale != 0.0f) {
-            llama_set_adapter_lora(ctx, la.ptr, la.scale);
-        }
+    std::vector<llama_adapter_lora *> loras;
+    std::vector<float> scales;
+
+    for (auto & la: lora) {
+        loras.push_back(la.ptr);
+        scales.push_back(la.scale);
     }
+
+    llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data());
 }

 struct llama_model_params common_model_params_to_llama(common_params & params) {
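For reference, this is roughly how the reworked common helper is driven from application code. A minimal sketch, assuming the loaded adapters live in a common_params field (the name lora_adapters is an assumption here) and relying only on the .ptr/.scale members used in the hunk above:

    // sketch: toggle adapters by adjusting their scales, then push the whole set at once
    std::vector<common_adapter_lora_info> & lora = params.lora_adapters; // assumed field name

    for (auto & la : lora) {
        la.scale = 0.0f;           // start with every adapter disabled
    }
    if (!lora.empty()) {
        lora[0].scale = 1.0f;      // enable just the first adapter
    }

    common_set_adapter_lora(lctx, lora); // one batched update of the context

Note the behavioral shift: the old helper filtered out zero-scale adapters before calling into the context, while the new one forwards everything and lets the context drop zero-scale entries (see the set_adapters_lora implementation further down).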
@@ -656,21 +656,12 @@ extern "C" {
     // The following functions operate on a llama_context, hence the naming: llama_verb_...

-    // Add a loaded LoRA adapter to given context
-    // This will not modify model's weight
-    LLAMA_API int32_t llama_set_adapter_lora(
+    // Set LoRa adapters on the context. Will only modify if the adapters currently in context are different.
+    LLAMA_API int32_t llama_set_adapters_lora(
             struct llama_context * ctx,
-            struct llama_adapter_lora * adapter,
-            float scale);
-
-    // Remove a specific LoRA adapter from given context
-    // Return -1 if the adapter is not present in the context
-    LLAMA_API int32_t llama_rm_adapter_lora(
-            struct llama_context * ctx,
-            struct llama_adapter_lora * adapter);
-
-    // Remove all LoRA adapters from given context
-    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
+            struct llama_adapter_lora ** adapters,
+            size_t n_adapters,
+            float * scales);

     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
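Migration for direct users of the C API in llama.h: the per-adapter set/remove/clear calls are replaced by a single call that describes the complete desired set. A minimal sketch, assuming a0 and a1 are adapters previously loaded with llama_adapter_lora_init (which this commit does not touch):

    // old (removed):
    //   llama_clear_adapter_lora(ctx);
    //   llama_set_adapter_lora(ctx, a0, 1.0f);
    //   llama_set_adapter_lora(ctx, a1, 0.5f);

    // new: pass the full set in one call; per the header comment above, repeating
    // the call with the same adapters and scales leaves the context untouched
    llama_adapter_lora * adapters[2] = { a0, a1 };
    float                scales[2]   = { 1.0f, 0.5f };

    llama_set_adapters_lora(ctx, adapters, 2, scales);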
@@ -678,7 +669,7 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
-    LLAMA_API int32_t llama_apply_adapter_cvec(
+    LLAMA_API int32_t llama_set_adapter_cvec(
             struct llama_context * ctx,
             const float * data,
             size_t len,
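The control-vector entry point is only renamed, so call sites change mechanically. A minimal sketch; the NULL-to-clear behavior comes from the comment above, and the remaining arguments are presumably ignored in that case:

    // apply a loaded control vector (data points to an n_embd x n_layers buffer)
    llama_set_adapter_cvec(ctx, cvec_data, cvec_len, n_embd, il_start, il_end);

    // clear the currently loaded vector
    llama_set_adapter_cvec(ctx, NULL, 0, n_embd, il_start, il_end);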
@@ -1057,51 +1057,43 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
     return true;
 }

-void llama_context::set_adapter_lora(
-        llama_adapter_lora * adapter,
-        float scale) {
-    LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
-
-    if (auto it = loras.find(adapter); it != loras.end()) {
-        if (it->second == scale) {
-            return;
-        }
-    }
-
-    loras[adapter] = scale;
-
-    sched_need_reserve = true;
-}
-
-bool llama_context::rm_adapter_lora(
-        llama_adapter_lora * adapter) {
-    LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
-
-    auto it = loras.find(adapter);
-    if (it != loras.end()) {
-        loras.erase(it);
-
-        sched_need_reserve = true;
-
-        return true;
-    }
-
-    return false;
-}
-
-void llama_context::clear_adapter_lora() {
-    LLAMA_LOG_DEBUG("%s: call\n", __func__);
+void llama_context::set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
+    LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);

-    if (loras.empty()) {
+    if (adapters_lora_are_same(adapters, n_adapters, scales)) {
         return;
     }

     loras.clear();

+    for (size_t i = 0; i < n_adapters; i ++) {
+        if (scales[i] != 0.0f) {
+            loras[adapters[i]] = scales[i];
+        }
+    }
+
     sched_need_reserve = true;
 }

-bool llama_context::apply_adapter_cvec(
+bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
+    LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);
+
+    if (n_adapters != loras.size()) {
+        return false;
+    }
+
+    for (size_t i = 0; i < n_adapters; i ++) {
+        auto it = loras.find(adapters[i]);
+
+        if (it == loras.end() || it->second != scales[i]) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+bool llama_context::set_adapter_cvec(
         const float * data,
         size_t len,
         int32_t n_embd,
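The adapters_lora_are_same early-out is what addresses the "fix excessive graph reserves" part of the commit title: sched_need_reserve is only raised when the requested set actually differs from what the context already holds, so repeatedly pushing an unchanged adapter list no longer forces a scheduler graph reserve. A self-contained analog of that check (a std::map<void *, float> standing in for the context's loras member), for illustration only:

    #include <cstdio>
    #include <map>

    // mirrors llama_context::adapters_lora_are_same: same count, same adapters, same scales
    static bool same_set(const std::map<void *, float> & cur, void ** adapters, size_t n, const float * scales) {
        if (n != cur.size()) {
            return false;
        }
        for (size_t i = 0; i < n; i++) {
            auto it = cur.find(adapters[i]);
            if (it == cur.end() || it->second != scales[i]) {
                return false;
            }
        }
        return true;
    }

    int main() {
        int a, b; // dummy objects standing in for loaded adapters
        void * adapters[2] = { &a, &b };
        float  scales[2]   = { 1.0f, 0.5f };

        const std::map<void *, float> cur = { { &a, 1.0f }, { &b, 0.5f } };

        std::printf("same = %d\n", same_set(cur, adapters, 2, scales)); // 1: no re-reserve needed
        scales[1] = 0.25f;
        std::printf("same = %d\n", same_set(cur, adapters, 2, scales)); // 0: context must be updated
    }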
@@ -3209,35 +3201,28 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {

 // llama adapter API

-int32_t llama_set_adapter_lora(
+int32_t llama_set_adapters_lora(
         llama_context * ctx,
-        llama_adapter_lora * adapter,
-        float scale) {
-    ctx->set_adapter_lora(adapter, scale);
+        llama_adapter_lora ** adapters,
+        size_t n_adapters,
+        float * scales) {
+    if (adapters == nullptr || scales == nullptr) {
+        GGML_ASSERT(n_adapters == 0 && "invalid llama_set_adapters_lora call");
+    }
+
+    ctx->set_adapters_lora(adapters, n_adapters, scales);

     return 0;
 }

-int32_t llama_rm_adapter_lora(
-        llama_context * ctx,
-        llama_adapter_lora * adapter) {
-    bool res = ctx->rm_adapter_lora(adapter);
-
-    return res ? 0 : -1;
-}
-
-void llama_clear_adapter_lora(llama_context * ctx) {
-    ctx->clear_adapter_lora();
-}
-
-int32_t llama_apply_adapter_cvec(
+int32_t llama_set_adapter_cvec(
         llama_context * ctx,
         const float * data,
         size_t len,
         int32_t n_embd,
         int32_t il_start,
         int32_t il_end) {
-    bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end);
+    bool res = ctx->set_adapter_cvec(data, len, n_embd, il_start, il_end);

     return res ? 0 : -1;
 }
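One consequence of the nullptr guard in the wrapper: with llama_clear_adapter_lora gone, "set zero adapters" appears to be how a context is emptied, since set_adapters_lora starts from loras.clear(). This reading is an inference from the diff rather than documented behavior, so treat the sketch below accordingly:

    // clear all LoRA adapters from the context by passing an empty set;
    // NULL pointers are only accepted when n_adapters == 0 (see the GGML_ASSERT above)
    llama_set_adapters_lora(ctx, NULL, 0, NULL);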
@@ -105,16 +105,11 @@ struct llama_context {
     void set_causal_attn(bool value);
     void set_warmup(bool value);

-    void set_adapter_lora(
-            llama_adapter_lora * adapter,
-            float scale);
+    void set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales);

-    bool rm_adapter_lora(
-            llama_adapter_lora * adapter);
+    bool adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales);

-    void clear_adapter_lora();
-
-    bool apply_adapter_cvec(
+    bool set_adapter_cvec(
             const float * data,
             size_t len,
             int32_t n_embd,