Mirror of https://github.com/ggml-org/llama.cpp.git
Merge branch 'master' into pr/18039
Changed files: common/arg.cpp (106 lines changed)
@@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-completion",
         "llama-convert-llama2c-to-ggml",
         "llama-cvector-generator",
         "llama-debug",
         "llama-diffusion-cli",
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
         "llama-finetune",
         "llama-fit-params",
         "llama-gemma3-cli",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
         "llama-gguf-split",
         "llama-gritlm",
         "llama-idle",
         "llama-imatrix",
         "llama-infill",
         "llama-mtmd-cli",
         "llama-llava-clip-quantize-cli",
         "llama-llava-cli",
         "llama-lookahead",
         "llama-lookup",
         "llama-lookup-create",
         "llama-lookup-merge",
         "llama-lookup-stats",
         "llama-minicpmv-cli",
         "llama-mtmd-cli",
         "llama-parallel",
         "llama-passkey",
         "llama-perplexity",
@@ -1279,13 +1284,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
-        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
+        {"-ctxcp", "--ctx-checkpoints", "--swa-checkpoints"}, "N",
         string_format("max number of context checkpoints to create per slot (default: %d)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
         [](common_params & params, int value) {
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    add_opt(common_arg(
+        {"-cpent", "--checkpoint-every-n-tokens"}, "N",
+        string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
+        [](common_params & params, int value) {
+            params.checkpoint_every_nt = value;
+        }
+    ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
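Annotation (not part of the diff): every option above goes through the same common_arg registration shape: a list of flag spellings, a value placeholder, a help string, a typed callback that writes into common_params, and chained .set_env()/.set_examples() calls. A minimal self-contained C++ sketch of that pattern, with simplified stand-in types (params_t/opt_t are illustrative, not the real structs):

    // Minimal sketch of the registration pattern (simplified; the real
    // common_arg in common/arg.cpp carries more state and validation).
    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <string>
    #include <vector>

    struct params_t {
        int n_ctx_checkpoints   = 8;   // placeholder default, not the real one
        int checkpoint_every_nt = -1;
    };

    struct opt_t {
        std::vector<std::string> flags;   // e.g. {"-ctxcp", "--ctx-checkpoints"}
        std::string value_hint;           // e.g. "N"
        std::string help;
        std::function<void(params_t &, const std::string &)> handler;
        std::string env;                  // optional environment-variable fallback

        opt_t & set_env(std::string name) { env = std::move(name); return *this; }
    };

    int main() {
        params_t params;
        std::vector<opt_t> opts;

        // Register an option: flag aliases, value placeholder, help text,
        // and a callback that writes the parsed value into params.
        opts.push_back(opt_t{
            {"-ctxcp", "--ctx-checkpoints"}, "N",
            "max number of context checkpoints to create per slot",
            [](params_t & p, const std::string & v) { p.n_ctx_checkpoints = std::stoi(v); },
            {}
        }.set_env("LLAMA_ARG_CTX_CHECKPOINTS"));

        // Environment fallback: applied when the flag is absent from argv.
        for (auto & opt : opts) {
            if (const char * v = opt.env.empty() ? nullptr : std::getenv(opt.env.c_str())) {
                opt.handler(params, v);
            }
        }
        std::printf("n_ctx_checkpoints = %d\n", params.n_ctx_checkpoints);
        return 0;
    }

Under this pattern, setting LLAMA_ARG_CTX_CHECKPOINTS in the environment is intended to behave like passing -ctxcp on the command line.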
@@ -1578,7 +1590,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--temp"}, "N",
+        {"--temp", "--temperature"}, "N",
         string_format("temperature (default: %.2f)", (double)params.sampling.temp),
         [](common_params & params, const std::string & value) {
             params.sampling.temp = std::stof(value);
@@ -1611,7 +1623,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--top-nsigma"}, "N",
+        {"--top-nsigma", "--top-n-sigma"}, "N",
         string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
         [](common_params & params, const std::string & value) {
             params.sampling.top_n_sigma = std::stof(value);
@@ -1634,7 +1646,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--typical"}, "N",
+        {"--typical", "--typical-p"}, "N",
         string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
         [](common_params & params, const std::string & value) {
             params.sampling.typ_p = std::stof(value);
@@ -2399,7 +2411,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.fit_params = false;
             } else {
                 throw std::runtime_error(
-                    string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
             }
         }
     ).set_env("LLAMA_ARG_FIT"));
@@ -2420,11 +2432,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 );
             }
             if (split_arg.size() == 1) {
-                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoull(split_arg[0]) * 1024*1024);
                 return;
             }
             for (size_t i = 0; i < split_arg.size(); i++) {
-                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+                params.fit_params_target[i] = std::stoull(split_arg[i]) * 1024*1024;
             }
         }
     ).set_env("LLAMA_ARG_FIT_TARGET"));
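Annotation (not part of the diff): the std::stoul to std::stoull change guards the MiB-to-bytes conversion. unsigned long is only 32 bits on LLP64 platforms (64-bit Windows) and on 32-bit builds, so any target of 4096 MiB or more would wrap when multiplied by 1024*1024. A standalone illustration of the overflow, simulating a 32-bit unsigned long:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    int main() {
        const std::string arg = "8192";  // 8192 MiB requested

        // Simulate a 32-bit unsigned long: the MiB -> bytes multiply wraps.
        uint32_t narrow = static_cast<uint32_t>(std::stoul(arg)) * 1024u * 1024u;
        // std::stoull returns unsigned long long (at least 64 bits): no wrap.
        uint64_t wide   = std::stoull(arg) * 1024 * 1024;

        std::printf("32-bit result: %u bytes\n", narrow);                       // 0 (wrapped)
        std::printf("64-bit result: %llu bytes\n", (unsigned long long) wide);  // 8589934592
        return 0;
    }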
@@ -2520,11 +2532,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"-a", "--alias"}, "STRING",
-        "set alias for model name (to be used by REST API)",
+        "set model name aliases, comma-separated (to be used by API)",
         [](common_params & params, const std::string & value) {
-            params.model_alias = value;
+            for (auto & alias : string_split<std::string>(value, ',')) {
+                alias = string_strip(alias);
+                if (!alias.empty()) {
+                    params.model_alias.insert(alias);
+                }
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
+    add_opt(common_arg(
+        {"--tags"}, "STRING",
+        "set model tags, comma-separated (informational, not used for routing)",
+        [](common_params & params, const std::string & value) {
+            for (auto & tag : string_split<std::string>(value, ',')) {
+                tag = string_strip(tag);
+                if (!tag.empty()) {
+                    params.model_tags.insert(tag);
+                }
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
     add_opt(common_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
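Annotation (not part of the diff): -a/--alias now takes a comma-separated list and inserts each stripped, non-empty entry into params.model_alias; the new --tags option mirrors the same split-strip-insert shape. string_split and string_strip are existing llama.cpp helpers; the sketch below only approximates them, and the std::set stand-in for the params member is an assumption (the member's actual container type is not visible in this hunk):

    #include <set>
    #include <sstream>
    #include <string>

    static std::string strip(const std::string & s) {
        const auto b = s.find_first_not_of(" \t");
        const auto e = s.find_last_not_of(" \t");
        return b == std::string::npos ? std::string() : s.substr(b, e - b + 1);
    }

    static std::set<std::string> parse_csv(const std::string & value) {
        std::set<std::string> out;
        std::stringstream ss(value);
        std::string item;
        while (std::getline(ss, item, ',')) {  // split on ','
            item = strip(item);
            if (!item.empty()) {               // skip empty segments, e.g. "a,,b"
                out.insert(item);              // a set also dedupes repeated names
            }
        }
        return out;
    }

    // parse_csv(" my-model, gpt-oss ,my-model") -> {"gpt-oss", "my-model"}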
@@ -2642,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
+                    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2810,6 +2840,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.webui_config_json = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+    add_opt(common_arg(
+        {"--webui-mcp-proxy"},
+        {"--no-webui-mcp-proxy"},
+        string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui_mcp_proxy = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
     add_opt(common_arg(
         {"--webui"},
         {"--no-webui"},
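Annotation (not part of the diff): the proxy option uses the negatable-boolean form, where the first spelling enables, the --no- spelling disables, and a single bool callback receives which one matched. A hypothetical standalone sketch of that dispatch (toggle_t is illustrative, not the real common_arg machinery):

    #include <cstring>
    #include <functional>

    struct toggle_t {
        const char * flag_on;    // e.g. "--webui-mcp-proxy"
        const char * flag_off;   // e.g. "--no-webui-mcp-proxy"
        std::function<void(bool)> handler;

        // Returns true if the token matched either spelling; the handler
        // receives true for the plain flag and false for the --no- form.
        bool consume(const char * tok) const {
            if (std::strcmp(tok, flag_on)  == 0) { handler(true);  return true; }
            if (std::strcmp(tok, flag_off) == 0) { handler(false); return true; }
            return false;
        }
    };

    // toggle_t{"--webui-mcp-proxy", "--no-webui-mcp-proxy",
    //          [&](bool on) { params.webui_mcp_proxy = on; }}.consume(argv[i]);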
@@ -2881,6 +2919,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
+                if (item.key() == "enable_thinking") {
+                    LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
+                            "Use --reasoning on / --reasoning off instead.\n");
+                }
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
@@ -3016,14 +3058,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"-rea", "--reasoning"}, "[on|off|auto]",
+        "Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.enable_reasoning = 1;
+                params.default_template_kwargs["enable_thinking"] = "true";
+            } else if (is_falsey(value)) {
+                params.enable_reasoning = 0;
+                params.default_template_kwargs["enable_thinking"] = "false";
+            } else if (is_autoy(value)) {
+                params.enable_reasoning = -1;
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
-        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
         [](common_params & params, int value) {
-            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            if (value < -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    add_opt(common_arg(
+        {"--reasoning-budget-message"}, "MESSAGE",
+        "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
+        [](common_params & params, const std::string & value) {
+            params.reasoning_budget_message = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
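Annotation (not part of the diff): --reasoning folds its argument into the tri-state params.enable_reasoning (1 on, 0 off, -1 auto) and, for the on/off cases, also pins the enable_thinking template kwarg. A condensed sketch of the mapping; the exact token sets accepted by is_truthy/is_falsey/is_autoy are an assumption here:

    #include <stdexcept>
    #include <string>

    static int parse_reasoning(const std::string & v) {
        // accepted spellings approximated; arg.cpp's helpers may accept more
        if (v == "on"  || v == "true"  || v == "1") { return  1; }  // force thinking on
        if (v == "off" || v == "false" || v == "0") { return  0; }  // force thinking off
        if (v == "auto")                            { return -1; }  // let the chat template decide
        throw std::invalid_argument("error: unknown value for --reasoning: '" + v + "'");
    }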
@@ -3582,6 +3649,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--check"},
+        string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
+        [](common_params & params) {
+            params.check = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_RESULTS}));
     add_opt(common_arg(
         {"--save-logits"},
         string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),