Compare commits


5 Commits

Author SHA1 Message Date
compilade
515f7d0d4f llama : fix quantization of shared token_embd (#5944) 2024-03-08 17:53:37 +02:00
Pierrick Hymbert
76e868821a server: metrics: add llamacpp:prompt_seconds_total and llamacpp:tokens_predicted_seconds_total, reset bucket only on /metrics. Fix values cast to int. Add Process-Start-Time-Unix header. (#5937)
Closes #5850
2024-03-08 12:25:04 +01:00
Don Mahurin
e457fb3540 llama : assume tied weights if lm_head/output weights is missing (#5824)
This is to support model configurations with "tie_word_embeddings" set to true.

Co-authored-by: Don Mahurin <2797413+dmahurin@users.noreply.github.com>
2024-03-08 12:41:50 +02:00
Georgi Gerganov
af37fd8b30 server : fix EOS token detection with disabled cache (#5938) 2024-03-08 12:40:02 +02:00
UEXTM.com
581ed5c4fe log : fix MSVC compile errors (#5643)
MSVC gives the following error with the existing macros:
`Error C2059 : syntax error: ','`

This patch adds `##` as a prefix to `__VA_ARGS__` to address this error.
2024-03-08 11:35:04 +02:00
5 changed files with 65 additions and 21 deletions

common/log.h

@@ -297,7 +297,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #ifndef _MSC_VER
     #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
-    #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
+    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif

 // Main TEE macro.
@@ -311,7 +311,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #ifndef _MSC_VER
     #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
-    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
+    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif

 // LOG macro variants with auto endline.
@@ -319,8 +319,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #ifndef _MSC_VER
     #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
     #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
-    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
-    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
+    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
 #endif

 // INTERNAL, DO NOT USE
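
Aside: the `##` relies on the comma-deletion extension supported by GCC, Clang, and MSVC's traditional preprocessor. A minimal standalone sketch of the failure mode, with LOG_IMPL reduced to a stand-in that merely forwards to fprintf (not the real llama.cpp macro):

    #include <cstdio>

    // Stand-in for the real implementation macro: forwards a format string
    // plus arguments to fprintf. Names here are simplified for illustration.
    #define LOG_IMPL(fmt, ...) fprintf(stderr, fmt, __VA_ARGS__)

    // Without ##, LOG("hi\n") expands to
    //     LOG_IMPL("%s" "hi\n", "", , "")
    // and the empty argument between the two commas is exactly what MSVC
    // rejects with "Error C2059: syntax error: ','". The ## makes the
    // preprocessor drop the preceding comma when __VA_ARGS__ is empty.
    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")

    int main() {
        LOG("no varargs\n");      // fine: ## removed the dangling comma
        LOG("answer = %d\n", 42); // behavior with varargs is unchanged
        return 0;
    }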

examples/server/server.cpp

@@ -335,8 +335,12 @@ struct server_slot {
 };

 struct server_metrics {
+    const int64_t t_start = ggml_time_us();
+
     uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total = 0;
     uint64_t n_tokens_predicted_total = 0;
+    uint64_t t_tokens_generation_total = 0;

     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
@@ -348,12 +352,14 @@ struct server_metrics {
         n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
         n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
         t_prompt_processing += slot.t_prompt_processing;
+        t_prompt_processing_total += slot.t_prompt_processing;
     }

     void on_prediction(const server_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-        n_tokens_predicted += slot.n_decoded;
-        t_tokens_generation += slot.t_token_generation;
+        n_tokens_predicted_total  += slot.n_decoded;
+        n_tokens_predicted        += slot.n_decoded;
+        t_tokens_generation       += slot.t_token_generation;
+        t_tokens_generation_total += slot.t_token_generation;
     }

     void reset_bucket() {
@@ -1123,7 +1129,7 @@ struct server_context {
             });
         }

-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) {
+        if (result.tok == llama_token_eos(model)) {
             slot.stopped_eos = true;
             slot.has_next_token = false;
@@ -1502,9 +1508,12 @@ struct server_context {
             { "idle",                            n_idle_slots },
             { "processing",                      n_processing_slots },
             { "deferred",                        queue_tasks.queue_tasks_deferred.size() },
+            { "t_start",                         metrics.t_start },

             { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total },
+            { "t_tokens_generation_total",       metrics.t_tokens_generation_total },
             { "n_tokens_predicted_total",        metrics.n_tokens_predicted_total },
+            { "t_prompt_processing_total",       metrics.t_prompt_processing_total },

             { "n_prompt_tokens_processed",       metrics.n_prompt_tokens_processed },
             { "t_prompt_processing",             metrics.t_prompt_processing },
@@ -1517,7 +1526,9 @@ struct server_context {
             { "slots", slots_data },
         };

-        metrics.reset_bucket();
+        if (json_value(task.data, "reset_bucket", false)) {
+            metrics.reset_bucket();
+        }
         queue_results.send(res);
     } break;
@@ -2709,6 +2720,7 @@ int main(int argc, char ** argv) {
     task.id_multi = -1;
     task.id_target = -1;
     task.type = SERVER_TASK_TYPE_METRICS;
+    task.data.push_back({{"reset_bucket", true}});

     ctx_server.queue_results.add_waiting_task_id(task.id);
     ctx_server.queue_tasks.post(task);
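
The hunks above implement a two-tier accounting scheme: every statistic has a monotonically growing *_total field backing a Prometheus counter, plus a bucket field used to derive per-scrape average gauges, and the bucket is cleared only when the posted task carries reset_bucket=true, which only the /metrics handler sets. A minimal sketch of the pattern (illustrative names, not the server's actual types):

    #include <cstdint>

    // One statistic tracked two ways: a monotonic total backing a Prometheus
    // counter, and a bucket cleared on each /metrics scrape so per-scrape
    // averages (gauges) can be derived from it.
    struct metric_pair {
        uint64_t total  = 0; // never reset
        uint64_t bucket = 0; // reset per scrape

        void add(uint64_t n) { total += n; bucket += n; }
        void reset_bucket()  { bucket = 0; }
    };

Guarding the reset behind the flag lets other consumers of the SERVER_TASK_TYPE_METRICS task read the aggregates without zeroing the bucket, which is the "reset bucket only on /metrics" part of the commit message.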
@@ -2732,20 +2744,28 @@ int main(int argc, char ** argv) {
         {"counter", {{
                 {"name", "prompt_tokens_total"},
                 {"help", "Number of prompt tokens processed."},
-                {"value", data["n_prompt_tokens_processed_total"]}
+                {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+        }, {
+                {"name", "prompt_seconds_total"},
+                {"help", "Prompt process time"},
+                {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
         }, {
                 {"name", "tokens_predicted_total"},
                 {"help", "Number of generation tokens processed."},
-                {"value", data["n_tokens_predicted_total"]}
+                {"value", (uint64_t) data["n_tokens_predicted_total"]}
+        }, {
+                {"name", "tokens_predicted_seconds_total"},
+                {"help", "Predict process time"},
+                {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
         }}},
         {"gauge", {{
                 {"name", "prompt_tokens_seconds"},
                 {"help", "Average prompt throughput in tokens/s."},
-                {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
+                {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.}
         },{
                 {"name", "predicted_tokens_seconds"},
                 {"help", "Average generation throughput in tokens/s."},
-                {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
+                {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.}
         },{
                 {"name", "kv_cache_usage_ratio"},
                 {"help", "KV-cache usage. 1 means 100 percent usage."},
@@ -2753,15 +2773,15 @@ int main(int argc, char ** argv) {
         },{
                 {"name", "kv_cache_tokens"},
                 {"help", "KV-cache tokens."},
-                {"value", data["kv_cache_tokens_count"]}
+                {"value", (uint64_t) data["kv_cache_tokens_count"]}
         },{
                 {"name", "requests_processing"},
                 {"help", "Number of request processing."},
-                {"value", data["processing"]}
+                {"value", (uint64_t) data["processing"]}
         },{
                 {"name", "requests_deferred"},
                 {"help", "Number of request deferred."},
-                {"value", data["deferred"]}
+                {"value", (uint64_t) data["deferred"]}
         }}}
     };
@@ -2775,13 +2795,16 @@ int main(int argc, char ** argv) {
             const std::string name = metric_def["name"];
             const std::string help = metric_def["help"];

-            auto value = json_value(metric_def, "value", 0);
+            auto value = json_value(metric_def, "value", 0.);
             prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
                        << "# TYPE llamacpp:" << name << " " << type << "\n"
                        << "llamacpp:" << name << " " << value << "\n";
         }
     }

+    const int64_t t_start = data["t_start"];
+    res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
+
     res.set_content(prometheus.str(), "text/plain; version=0.0.4");
     res.status = 200; // HTTP OK
 });
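
End to end, the handler emits one HELP/TYPE/value triple per metric, so a scrape of /metrics returns exposition-format text along these lines (values illustrative). The timing totals are accumulated in milliseconds and divided by 1.e3 on export; for example, 500 prompt tokens processed in 2000 ms would yield prompt_seconds_total 2 and a prompt_tokens_seconds gauge of 1e3 / 2000 * 500 = 250 tokens/s:

    # HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.
    # TYPE llamacpp:prompt_tokens_total counter
    llamacpp:prompt_tokens_total 500
    # HELP llamacpp:prompt_seconds_total Prompt process time
    # TYPE llamacpp:prompt_seconds_total counter
    llamacpp:prompt_seconds_total 2
    # HELP llamacpp:prompt_tokens_seconds Average prompt throughput in tokens/s.
    # TYPE llamacpp:prompt_tokens_seconds gauge
    llamacpp:prompt_tokens_seconds 250

The remaining counters and gauges follow in the same shape, and the Process-Start-Time-Unix header travels with the same response. The json_value default changing from 0 to 0. above is presumably what the commit title means by "Fix values cast to int": with an int default the deduced value type would truncate the fractional gauge values.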

examples/server/tests/features/server.feature

@@ -29,6 +29,7 @@ Feature: llama.cpp server
         And a completion request with no api error
         Then <n_predicted> tokens are predicted matching <re_content>
         And prometheus metrics are exposed
+        And metric llamacpp:tokens_predicted is <n_predicted>

         Examples: Prompts
             | prompt | n_predict | re_content | n_predicted |

examples/server/tests/features/steps/steps.py

@@ -586,14 +586,24 @@ async def step_prometheus_metrics_exported(context):
     metric_exported = False
     if context.debug:
         print(f"/metrics answer:\n{metrics_raw}\n")
+    context.metrics = {}
     for metric in parser.text_string_to_metric_families(metrics_raw):
         match metric.name:
             case "llamacpp:kv_cache_usage_ratio":
                 assert len(metric.samples) > 0
                 metric_exported = True
+        context.metrics[metric.name] = metric
+    assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
     assert metric_exported, "No metrics exported"


+@step(u'metric {metric_name} is {metric_value:d}')
+def step_assert_metric_value(context, metric_name, metric_value):
+    if metric_name not in context.metrics:
+        assert False, f"no metric {metric_name} in {context.metrics.keys()}"
+    assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
+
+
 @step(u'available models')
 def step_available_models(context):
     # openai client always expects an api_key
@@ -879,7 +889,6 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
                               f' {n_predicted} <> {expected_predicted_n}')

-
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:

llama.cpp

@@ -3888,7 +3888,13 @@ static bool llm_load_tensors(
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             if (model.arch != LLM_ARCH_MINICPM){
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                // if output is NULL, init from the input tok embed
+                if (model.output == NULL) {
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+                }
             }
         }
@@ -10967,6 +10973,9 @@ struct quantize_state_internal {
     bool has_imatrix = false;

+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
@@ -11064,8 +11073,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
-        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -11454,6 +11462,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
         }
+        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs.has_output = true;
+        }
     }

     if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
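
Both llama.cpp hunks revolve around weight tying (the "tie_word_embeddings" configuration mentioned in #5824): a tied model ships no separate output.weight, so the loader now creates LLM_TENSOR_OUTPUT as optional (the added false argument) and falls back to the token-embedding tensor, while the quantizer tracks has_output so that a shared tensor gets the output tensor's quantization type. A minimal numeric sketch of the tying idea itself (illustrative only, not llama.cpp's API): one n_vocab x n_embd matrix serves both as the embedding lookup table and as the output projection.

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_vocab = 4, n_embd = 3;

        // One shared matrix, one row per token: tok_embd doubles as lm_head.
        std::vector<float> tok_embd = {
            0.1f, 0.2f, 0.3f,
            0.4f, 0.5f, 0.6f,
            0.7f, 0.8f, 0.9f,
            1.0f, 1.1f, 1.2f,
        };

        // Embedding lookup for token 2 is just row 2 of the matrix.
        const float * h = &tok_embd[2 * n_embd];

        // The output projection reuses the same rows: logits[v] = dot(row v, h).
        // Because one tensor serves both roles, quantizing it with the output
        // tensor's type (as the quantizer hunks arrange) affects both at once.
        for (int v = 0; v < n_vocab; ++v) {
            float logit = 0.0f;
            for (int i = 0; i < n_embd; ++i) {
                logit += tok_embd[v * n_embd + i] * h[i];
            }
            printf("logit[%d] = %.2f\n", v, logit);
        }
        return 0;
    }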