Compare commits


11 Commits
b8244 ... b8255

Author SHA1 Message Date
Georgi Gerganov
43e1cbd6c1 models : fix assert in mamba2 graph (#20270) 2026-03-09 13:15:15 +02:00
Georgi Gerganov
107d599952 server : add kill switch when server is stuck (#20277) 2026-03-09 10:33:12 +02:00
Aman Gupta
e8bbc736cb ggml-cuda: disable gdn for musa (#20278) 2026-03-09 16:15:36 +08:00
ddh0
b518195101 llama-quant : left-align tensor names in output (#20117) 2026-03-09 09:28:41 +02:00
Aman Gupta
e2763a6723 contributing: limit open PRs for new contributors to 1 (#20036) 2026-03-09 15:05:34 +08:00
Bertay Eren
0beb8db3a0 ggml-vulkan: add SGN operator, auto-generate Vulkan.csv and ops.md (#20219) 2026-03-09 07:24:16 +01:00
Ruben Ortlam
b2f460bd3c vulkan: skip zero size tensors in backend copies (#20233) 2026-03-09 07:23:45 +01:00
Michael Huang
5f4cdac385 cuda : display total and free VRAM capacity during device initialization (#20185) 2026-03-09 12:45:43 +08:00
Aaron Teo
ae87863dc1 llama-bench: introduce -hf and -hff flags & use --mmap 1 by default (#20211) 2026-03-09 09:05:44 +08:00
Piotr Wilkin (ilintar)
97c64fbdbd PEG parser for LFM2 (#20251)
* PEG parser for LFM2

* Simplify using python_value()
2026-03-09 01:11:22 +01:00
Georgi Gerganov
d417bc43dd server : do not create checkpoints right after mtmd chunks (#20232) 2026-03-08 22:16:46 +02:00
17 changed files with 473 additions and 133 deletions

View File

@@ -39,6 +39,7 @@ Before submitting your PR:
- For intricate features, consider opening a feature request first to discuss and align expectations
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
After submitting your PR:
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability

View File

@@ -476,6 +476,74 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const nlohmann::json & tools,
bool parallel_tool_calls) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
auto tool_choices = choice();
for (const auto & tool_def : tools) {
if (!tool_def.contains("function")) {
continue;
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
auto args = eps();
if (params.contains("properties") && !params["properties"].empty()) {
auto arg_choice = choice();
for (const auto & el : params["properties"].items()) {
const std::string & prop_name = el.key();
const auto & prop_def = el.value();
bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(json_string_content()) + literal("\""),
literal("'") + tool_arg_string_value(json_string_content()) + literal("'")
});
if (is_string_type) {
arg_value_parser = string_value_parser;
} else {
arg_value_parser = tool_arg_value(python_value());
}
// Full argument: name="value" or name=value
auto arg_rule = tool_arg(
tool_arg_open(eps()) +
tool_arg_name(arg_name_parser) +
literal("=") +
arg_value_parser +
tool_arg_close(eps())
);
arg_choice |= arg_rule;
}
args = arg_choice + zero_or_more("," + space() + arg_choice);
}
auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
space() + tool_args(args) + space() + tool_close(literal(")"))
);
tool_choices |= rule("tool-" + name, tool_parser);
}
if (parallel_tool_calls) {
return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
}
return "[" + space() + tool_choices + space() + "]";
}
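As a rough illustration (these pairs are illustrative and not copied from the test suite), the grammar built above is meant to accept bracketed python-style calls and map them to JSON arguments along these lines:
#include <utility>  // std::pair

// Illustrative only: hypothetical inputs the python-style grammar above should
// accept, paired with the arguments JSON the resulting tool call should carry.
static const std::pair<const char *, const char *> python_style_examples[] = {
    // { raw tool-call text,           expected arguments JSON for that call }
    { "[get_time(city=\"Paris\")]",    "{\"city\": \"Paris\"}" },  // double-quoted string value
    { "[get_time(city='Paris')]",      "{\"city\": \"Paris\"}" },  // single-quoted string value
    { "[special_function(arg1=1)]",    "{\"arg1\": 1}"         },  // non-string value via python_value()
};
// With parallel_tool_calls enabled, one bracket may hold several calls,
// e.g. [get_time(city="Paris"), special_function(arg1=1)].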
// Helper: Parse dot notation key into prefix and field name
static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
auto dot_pos = key.find('.');

View File

@@ -112,6 +112,11 @@ class common_chat_peg_builder : public common_peg_parser_builder {
bool parallel_tool_calls,
bool force_tool_calls);
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::json & tools,
bool parallel_tool_calls);
private:
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,

View File

@@ -1274,6 +1274,82 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
);
auto content = p.content(p.until(TOOL_CALL_START));
return reasoning + content + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
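For orientation, a hypothetical raw assistant message that exercises the three parts described in the format comment above (the tool call matches the tests further down; the wording is made up):
// Hypothetical LFM2 output: reasoning, plain content, then a python-style
// tool call between the <|tool_call_start|>/<|tool_call_end|> markers.
static const char * lfm2_example_output =
    "<think>The user wants the local time, so I should call get_time.</think>"
    "Let me check that for you."
    "<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>";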
namespace workaround {
// if first message is system and template does not support it, merge it with next message
@@ -1422,6 +1498,14 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
try {
LOG_DBG("Using differential autoparser\n");
struct autoparser::autoparser autoparser;

View File

@@ -47,6 +47,7 @@ Legend:
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -92,7 +93,7 @@ Legend:
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | | ✅ | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |

View File

@@ -1,8 +1,8 @@
"backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
@@ -85,8 +85,8 @@
"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
@@ -13591,3 +13591,16 @@
"Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan"
"Vulkan0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","0","no","Vulkan"

View File

@@ -205,7 +205,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
total_vram += prop.totalGlobalMem;
}
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices (Total VRAM: %zu MiB):\n",
__func__, info.device_count, (size_t)(total_vram / (1024 * 1024)));
total_vram = 0;
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
for (int id = 0; id < info.device_count; ++id) {
@@ -243,6 +250,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].supports_cooperative_launch = false;
#endif // !(GGML_USE_MUSA)
// cudaMemGetInfo returns info for the current device
size_t free_mem;
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL));
#if defined(GGML_USE_HIP)
info.devices[id].smpbo = prop.sharedMemPerBlock;
@@ -257,22 +270,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
device_vmm ? "yes" : "no", prop.warpSize);
device_vmm ? "yes" : "no", prop.warpSize,
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -4976,9 +4992,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_LEAKY_RELU:
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_GATED_DELTA_NET:
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
// TODO: enable once the MUSA compiler issue is resolved: https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
#ifdef GGML_USE_MUSA
return false;
#else
return true;
#endif // GGML_USE_MUSA
case GGML_OP_FLASH_ATTN_EXT:
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
case GGML_OP_CROSS_ENTROPY_LOSS:
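A minimal standalone host-side sketch (separate from the ggml code above) using the same CUDA runtime calls the diff relies on: cudaGetDeviceProperties for the total VRAM and cudaMemGetInfo for the free amount on the currently selected device.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        cudaSetDevice(id);  // cudaMemGetInfo reports the current device
        size_t free_mem = 0, total_mem = 0;
        cudaMemGetInfo(&free_mem, &total_mem);
        printf("Device %d: %s, VRAM: %zu MiB (%zu MiB free)\n",
               id, prop.name,
               (size_t)(prop.totalGlobalMem / (1024 * 1024)),
               free_mem / (1024 * 1024));
    }
    return 0;
}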

View File

@@ -763,6 +763,7 @@ struct vk_device_struct {
vk_pipeline pipeline_ceil[2];
vk_pipeline pipeline_floor[2];
vk_pipeline pipeline_trunc[2];
vk_pipeline pipeline_sgn[2];
vk_pipeline pipeline_add1_f16_f16;
vk_pipeline pipeline_add1_f16_f32;
@@ -4393,6 +4394,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_UNARY(ceil)
CREATE_UNARY(floor)
CREATE_UNARY(trunc)
CREATE_UNARY(sgn)
#undef CREATE_UNARY
#define CREATE_UNARY_RTE(name) \
@@ -9281,6 +9283,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16];
case GGML_UNARY_OP_TRUNC:
return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16];
case GGML_UNARY_OP_SGN:
return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16];
default:
break;
}
@@ -12875,6 +12879,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
case GGML_UNARY_OP_SGN:
ggml_vk_unary(ctx, compute_ctx, src0, node);
break;
case GGML_UNARY_OP_XIELU:
@@ -13253,6 +13258,10 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
vk_buffer buf = buf_ctx->dev_buffer;
if (size == 0) {
return;
}
uint32_t val32 = (uint32_t)value * 0x01010101;
ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size);
}
@@ -13262,6 +13271,10 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
vk_buffer buf = buf_ctx->dev_buffer;
if (size == 0) {
return;
}
ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
}
@@ -13269,12 +13282,20 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons
VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
if (size == 0) {
return;
}
vk_buffer buf = buf_ctx->dev_buffer;
ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
}
static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_nbytes(src) == 0) {
return true;
}
if (ggml_backend_buffer_is_vk(src->buffer)) {
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
@@ -13464,6 +13485,10 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
if (size == 0) {
return;
}
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context cpy_ctx;
@@ -13507,6 +13532,10 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
if (size == 0) {
return;
}
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
@@ -13533,9 +13562,14 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
}
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
// Skip zero-size tensors
if (ggml_nbytes(src) == 0) {
return true;
}
if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) {
return false;
}
@@ -14975,6 +15009,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
case GGML_UNARY_OP_SGN:
return ggml_is_contiguous(op->src[0]) &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
@@ -16141,6 +16176,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
case GGML_UNARY_OP_TRUNC:
tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]);
break;
case GGML_UNARY_OP_SGN:
tensor_clone = ggml_sgn(ggml_ctx, src_clone[0]);
break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ABORT("fatal error");

View File

@@ -0,0 +1,21 @@
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
// flatten the 3D dispatch into a linear element index (262144 = 512 * 512)
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
data_d[i] = D_TYPE(sign(float(data_a[i])));
}
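For reference, GLSL sign() returns -1.0, 0.0, or 1.0; an equivalent scalar sketch in C++ of what the shader computes per element:
// Scalar reference for the shader above:
// negative -> -1, zero -> 0, positive -> +1
static float sgn_ref(float x) {
    return (x > 0.0f) ? 1.0f : (x < 0.0f) ? -1.0f : 0.0f;
}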

View File

@@ -871,6 +871,8 @@ void process_shaders() {
string_to_spv("elu_f32", "elu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("xielu_f16", "xielu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("xielu_f32", "xielu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("sgn_f16", "sgn.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("sgn_f32", "sgn.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});

View File

@@ -6,7 +6,7 @@
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "You can use the following tools: <|tool_list_start|>[" -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
@@ -17,7 +17,6 @@
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{{- '**IMPORTANT**: The syntax for calling the tools is: <|tool_call_start|>JSON tool call goes here<|tool_call_end|>. Please only call tools in the specified manner.' -}}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
@@ -30,18 +29,9 @@
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
{%- elif message["role"] == "assistant" -%}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<|tool_call_start|>\n{"name": "' + tool_call.name + '", "arguments": ' + (tool_call.arguments if tool_call.arguments is string else tool_call.arguments | tojson) + '}\n<|tool_call_end|>\n' }}
{%- endfor %}
{%- endif %}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
{%- endif -%}

View File

@@ -1,37 +0,0 @@
{{- bos_token -}}
{%- set system_prompt = "" -%}
{%- set ns = namespace(system_prompt="") -%}
{%- if messages[0]["role"] == "system" -%}
{%- set ns.system_prompt = messages[0]["content"] -%}
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
{%- endif -%}
{%- set ns.system_prompt = ns.system_prompt + tool -%}
{%- if not loop.last -%}
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{%- for message in messages -%}
{{- "<|im_start|>" + message["role"] + "\n" -}}
{%- set content = message["content"] -%}
{%- if content is not string -%}
{%- set content = content | tojson -%}
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}

View File

@@ -778,7 +778,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ml.load_data_for(tensor);
}
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
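The only functional change in this hunk is the printf width flag; a minimal sketch of the difference, using an illustrative tensor name:
#include <cstdio>
int main() {
    // "%36s"  pads on the left  -> right-aligned column (old behaviour)
    // "%-36s" pads on the right -> left-aligned column  (new behaviour)
    printf("[%36s]\n",  "blk.0.attn_q.weight");
    printf("[%-36s]\n", "blk.0.attn_q.weight");
    return 0;
}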

View File

@@ -155,7 +155,6 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
const auto kv_head = mctx_cur->get_head();
const int64_t n_embd = hparams.n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
@@ -170,7 +169,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
GGML_ASSERT(d_inner % (n_group*n_embd) == 0);
GGML_ASSERT(d_inner % (n_group*d_state) == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

View File

@@ -2387,6 +2387,78 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|>
{
auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with reasoning and content
tst.test("<think>I need to call a function</think>"
"Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"I need to call a function", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
))
.run();
// Python tool with multiline code in string
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
})
.run();
// Partial tool call (streaming)
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
}
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
{

View File

@@ -20,6 +20,7 @@
#include <unordered_set>
#include "common.h"
#include "download.h"
#include "ggml.h"
#include "llama.h"
@@ -312,6 +313,9 @@ static std::vector<int> parse_int_range(const std::string & s) {
struct cmd_params {
std::vector<std::string> model;
std::vector<std::string> hf_repo;
std::vector<std::string> hf_file;
std::string hf_token;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
@@ -351,6 +355,9 @@ struct cmd_params {
static const cmd_params cmd_params_defaults = {
/* model */ { "models/7B/ggml-model-q4_0.gguf" },
/* hf_repo */ {},
/* hf_file */ {},
/* hf_token */ "",
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
@@ -372,7 +379,7 @@ static const cmd_params cmd_params_defaults = {
/* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
/* use_mmap */ { false },
/* use_mmap */ { true },
/* use_direct_io */ { false },
/* embeddings */ { false },
/* no_op_offload */ { false },
@@ -393,74 +400,57 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
cmd_params_defaults.reps);
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n",
cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
if (llama_supports_rpc()) {
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
}
printf("\n");
printf("test parameters:\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n",
join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n",
join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n",
join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n",
join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n",
join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n",
join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n",
join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n",
join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n");
printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n");
printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n");
printf(" (default: unused)\n");
printf(" -hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo\n");
printf(" (default: unused)\n");
printf(" -hft, --hf-token <token> Hugging Face access token\n");
printf(" (default: value from HF_TOKEN environment variable)\n");
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n", join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n",
join(cmd_params_defaults.no_host, ",").c_str());
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n", join(cmd_params_defaults.no_host, ",").c_str());
printf("\n");
printf(
"Multiple values can be given for each parameter by separating them with ','\n"
@@ -514,6 +504,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.progress = cmd_params_defaults.progress;
params.no_warmup = cmd_params_defaults.no_warmup;
if (const char * env = getenv("HF_TOKEN")) {
params.hf_token = env;
}
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
@@ -531,6 +525,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-hf" || arg == "-hfr" || arg == "--hf-repo") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.hf_repo.insert(params.hf_repo.end(), p.begin(), p.end());
} else if (arg == "-hff" || arg == "--hf-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.hf_file.insert(params.hf_file.end(), p.begin(), p.end());
} else if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hf_token = argv[i];
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -961,6 +975,44 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
exit(1);
}
if (!params.hf_repo.empty()) {
for (size_t i = 0; i < params.hf_repo.size(); i++) {
common_params_model model;
// step 1: no `-hff` provided, we auto-detect based on the `-hf` flag
if (params.hf_file.empty() || params.hf_file[i].empty()) {
auto auto_detected = common_get_hf_file(params.hf_repo[i], params.hf_token, false);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1);
}
model.name = params.hf_repo[i];
model.hf_repo = auto_detected.repo;
model.hf_file = auto_detected.ggufFile;
} else {
model.hf_file = params.hf_file[i];
}
// step 2: construct the model cache path
std::string clean_fname = model.hf_repo + "_" + model.hf_file;
string_replace_all(clean_fname, "\\", "_");
string_replace_all(clean_fname, "/", "_");
model.path = fs_get_cache_file(clean_fname);
// step 3: download the model if it does not exist yet
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
bool ok = common_download_model(model, params.hf_token, false);
if (!ok) {
fprintf(stderr, "error: failed to download model from %s\n", model.url.c_str());
exit(1);
}
params.model.push_back(model.path);
}
}
// set defaults
if (params.model.empty()) {
params.model = cmd_params_defaults.model;
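A standalone sketch of the cache-name sanitization in "step 2" above: the repo and file names are joined and path separators are flattened to underscores. string_replace_all and fs_get_cache_file are llama.cpp helpers; here a plain std::string loop stands in for them, the repo is taken from the help text above, and the file name is hypothetical.
#include <string>
#include <iostream>

static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string repo = "unsloth/phi-4-GGUF";   // example repo from the -hf help text
    std::string file = "phi-4-Q4_K_M.gguf";    // hypothetical GGUF file name
    std::string clean_fname = repo + "_" + file;
    replace_all(clean_fname, "\\", "_");
    replace_all(clean_fname, "/", "_");
    std::cout << clean_fname << "\n";          // unsloth_phi-4-GGUF_phi-4-Q4_K_M.gguf
    return 0;
}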

View File

@@ -562,7 +562,7 @@ private:
llama_model_ptr model_dft;
bool add_bos_token = true;
bool add_bos_token = true;
int32_t n_ctx; // total context for all clients / slots
@@ -570,6 +570,7 @@ private:
std::vector<server_slot> slots;
int slots_debug = 0;
int n_empty_consequtive = 0;
std::unique_ptr<server_prompt_cache> prompt_cache;
@@ -2438,6 +2439,8 @@ private:
slot.n_prompt_tokens_cache = 0;
}
bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
// check if we should process the image
if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
// process the image
@@ -2457,6 +2460,8 @@ private:
const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
slot.prompt.tokens.push_back(chunk.get()); // copy
}
do_checkpoint = false; // do not checkpoint right after an image chunk
}
// If using an alora, there may be uncached tokens that come
@@ -2473,8 +2478,6 @@ private:
alora_disabled_id = enabled_loras[0];
}
bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
// make checkpoints only for completion tasks
do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION;
@@ -2626,6 +2629,12 @@ private:
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
if (++n_empty_consequtive > 3) {
GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277");
}
} else {
n_empty_consequtive = 0;
}
int32_t i_next = 0;