Merge branch 'master' into pr/18039

This commit is contained in:
Georgi Gerganov
2026-03-16 15:41:24 +02:00
857 changed files with 80689 additions and 43153 deletions

View File

@@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-completion",
"llama-convert-llama2c-to-ggml",
"llama-cvector-generator",
"llama-debug",
"llama-diffusion-cli",
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-finetune",
"llama-fit-params",
"llama-gemma3-cli",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
"llama-gguf-split",
"llama-gritlm",
"llama-idle",
"llama-imatrix",
"llama-infill",
"llama-mtmd-cli",
"llama-llava-clip-quantize-cli",
"llama-llava-cli",
"llama-lookahead",
"llama-lookup",
"llama-lookup-create",
"llama-lookup-merge",
"llama-lookup-stats",
"llama-minicpmv-cli",
"llama-mtmd-cli",
"llama-parallel",
"llama-passkey",
"llama-perplexity",
@@ -1279,13 +1284,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
{"-ctxcp", "--ctx-checkpoints", "--swa-checkpoints"}, "N",
string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-cpent", "--checkpoint-every-n-tokens"}, "N",
string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
[](common_params & params, int value) {
params.checkpoint_every_nt = value;
}
).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-cram", "--cache-ram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
@@ -1578,7 +1590,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--temp"}, "N",
{"--temp", "--temperature"}, "N",
string_format("temperature (default: %.2f)", (double)params.sampling.temp),
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
@@ -1611,7 +1623,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--top-nsigma"}, "N",
{"--top-nsigma", "--top-n-sigma"}, "N",
string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
[](common_params & params, const std::string & value) {
params.sampling.top_n_sigma = std::stof(value);
@@ -1634,7 +1646,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--typical"}, "N",
{"--typical", "--typical-p"}, "N",
string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
[](common_params & params, const std::string & value) {
params.sampling.typ_p = std::stof(value);
@@ -2399,7 +2411,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.fit_params = false;
} else {
throw std::runtime_error(
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_ARG_FIT"));
@@ -2420,11 +2432,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
);
}
if (split_arg.size() == 1) {
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoull(split_arg[0]) * 1024*1024);
return;
}
for (size_t i = 0; i < split_arg.size(); i++) {
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
params.fit_params_target[i] = std::stoull(split_arg[i]) * 1024*1024;
}
}
).set_env("LLAMA_ARG_FIT_TARGET"));
@@ -2520,11 +2532,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"-a", "--alias"}, "STRING",
"set alias for model name (to be used by REST API)",
"set model name aliases, comma-separated (to be used by API)",
[](common_params & params, const std::string & value) {
params.model_alias = value;
for (auto & alias : string_split<std::string>(value, ',')) {
alias = string_strip(alias);
if (!alias.empty()) {
params.model_alias.insert(alias);
}
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
add_opt(common_arg(
{"--tags"}, "STRING",
"set model tags, comma-separated (informational, not used for routing)",
[](common_params & params, const std::string & value) {
for (auto & tag : string_split<std::string>(value, ',')) {
tag = string_strip(tag);
if (!tag.empty()) {
params.model_tags.insert(tag);
}
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -2642,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2810,6 +2840,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.webui_config_json = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--webui-mcp-proxy"},
{"--no-webui-mcp-proxy"},
string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
@@ -2881,6 +2919,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
if (item.key() == "enable_thinking") {
LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
"Use --reasoning on / --reasoning off instead.\n");
}
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
@@ -3016,14 +3058,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.reasoning_format = common_reasoning_format_from_name(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"-rea", "--reasoning"}, "[on|off|auto]",
"Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.enable_reasoning = 1;
params.default_template_kwargs["enable_thinking"] = "true";
} else if (is_falsey(value)) {
params.enable_reasoning = 0;
params.default_template_kwargs["enable_thinking"] = "false";
} else if (is_autoy(value)) {
params.enable_reasoning = -1;
} else {
throw std::invalid_argument(
string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -3582,6 +3649,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg(
{"--check"},
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
[](common_params & params) {
params.check = true;
}
).set_examples({LLAMA_EXAMPLE_RESULTS}));
add_opt(common_arg(
{"--save-logits"},
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),