mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-21 16:34:05 +00:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd544c94a3 | ||
|
|
14be5a39b1 | ||
|
|
fbbf3ad190 | ||
|
|
33f890e579 |
@@ -57,6 +57,17 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
|
||||
throw std::runtime_error("error: invalid URL format");
|
||||
}
|
||||
|
||||
#ifndef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
if (parts.scheme == "https") {
|
||||
throw std::runtime_error(
|
||||
"HTTPS is not supported. Please rebuild with:\n"
|
||||
" -DLLAMA_BUILD_BORINGSSL=ON\n"
|
||||
" -DLLAMA_BUILD_LIBRESSL=ON\n"
|
||||
"or ensure dev files of an OpenSSL-compatible library are available when building."
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
httplib::Client cli(parts.scheme + "://" + parts.host);
|
||||
|
||||
if (!parts.user.empty()) {
|
||||
|
||||
@@ -1518,6 +1518,15 @@ struct vk_quantize_q8_1_push_constants {
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
struct vk_op_flash_attn_split_k_reduce_push_constants {
|
||||
uint32_t D;
|
||||
uint32_t ne1;
|
||||
uint32_t ne2;
|
||||
uint32_t ne3;
|
||||
uint32_t k_num;
|
||||
uint32_t sinks;
|
||||
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
struct vk_staging_memcpy {
|
||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||
@@ -1804,7 +1813,6 @@ struct ggml_backend_vk_context {
|
||||
bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync;
|
||||
|
||||
vk_context_ref compute_ctx;
|
||||
vk_context_ref transfer_ctx;
|
||||
|
||||
std::vector<vk_context_ref> tensor_ctxs;
|
||||
|
||||
@@ -1814,7 +1822,6 @@ struct ggml_backend_vk_context {
|
||||
uint32_t pipeline_descriptor_set_requirements {};
|
||||
|
||||
vk_command_pool compute_cmd_pool;
|
||||
vk_command_pool transfer_cmd_pool;
|
||||
|
||||
// number of additional consecutive nodes that are being fused with the
|
||||
// node currently being processed
|
||||
@@ -3982,7 +3989,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
|
||||
if (device->subgroup_clustered && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
@@ -5649,7 +5656,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||
ctx->almost_ready_fence = ctx->device->device.createFence({});
|
||||
|
||||
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
|
||||
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
||||
@@ -8457,14 +8463,14 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
GGML_ASSERT(0);
|
||||
}
|
||||
|
||||
if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa &&
|
||||
if (N <= 8 && qk_ratio > 1 && qk_ratio <= max_gqa &&
|
||||
qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) {
|
||||
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
|
||||
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
|
||||
// and change addressing calculations to index Q's dimension 2.
|
||||
gqa_ratio = qk_ratio;
|
||||
N = gqa_ratio;
|
||||
workgroups_y /= N;
|
||||
workgroups_y /= gqa_ratio;
|
||||
}
|
||||
|
||||
bool small_rows = N <= get_fa_num_small_rows(path);
|
||||
@@ -8526,6 +8532,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
}
|
||||
|
||||
assert(pipeline);
|
||||
// Compile early to initialize wg_denoms.
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
uint32_t split_kv = KV;
|
||||
uint32_t split_k = 1;
|
||||
@@ -8533,22 +8541,24 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
// Use a placeholder core count if one isn't available. split_k is a big help for perf.
|
||||
const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count : 16;
|
||||
|
||||
// Try to use split_k when KV is large enough to be worth the overhead
|
||||
if (workgroups_x == 1 && shader_core_count > 0) {
|
||||
// Try to use split_k when KV is large enough to be worth the overhead.
|
||||
// Must either be a single batch or be using gqa, we can't mix the two.
|
||||
if (workgroups_x <= pipeline->wg_denoms[0] && (workgroups_x == 1 || gqa_ratio > 1)) {
|
||||
// Try to run two workgroups per SM.
|
||||
split_k = shader_core_count * 2 / (workgroups_y * workgroups_z);
|
||||
split_k = shader_core_count * 2 / (workgroups_x * workgroups_y * workgroups_z);
|
||||
if (split_k > 1) {
|
||||
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
|
||||
// of "align", so recompute split_k based on that.
|
||||
split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment);
|
||||
split_k = CEIL_DIV(KV, split_kv);
|
||||
workgroups_x = split_k;
|
||||
}
|
||||
}
|
||||
|
||||
// Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
|
||||
// and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
|
||||
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
|
||||
// For matrices, the order is (inner to outer) [HSV, ne1, k, ne2, ne3].
|
||||
// For L/M, the order is (inner to outer) [ne1, k, ne2, ne3].
|
||||
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne2 * ne3 : 0;
|
||||
if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
|
||||
GGML_ABORT("Requested preallocation size is too large");
|
||||
}
|
||||
@@ -8559,7 +8569,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
|
||||
{
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
||||
}
|
||||
@@ -8608,7 +8617,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
if (ctx->prealloc_split_k_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
workgroups_x *= pipeline->wg_denoms[0];
|
||||
vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf},
|
||||
@@ -8616,15 +8625,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
// there's no more than one tile of rows (i.e. workgroups_x would have been
|
||||
// one). We reuse workgroups_x to mean the number of splits, so we need to
|
||||
// cancel out the divide by wg_denoms[0].
|
||||
pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
|
||||
pc, { split_k * workgroups_x, workgroups_y, workgroups_z });
|
||||
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
|
||||
const vk_op_flash_attn_split_k_reduce_push_constants pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, split_k, (sinks != nullptr) };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
|
||||
{split_k_buf, sinks_buf, dst_buf},
|
||||
pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
|
||||
pc2, { (uint32_t)ne1, HSV, (uint32_t)(ne2 * ne3) });
|
||||
ctx->prealloc_split_k_need_sync = true;
|
||||
} else {
|
||||
if (gqa_ratio > 1) {
|
||||
// When using gqa, we want one actual workgroup per batch, so cancel out wg_denoms
|
||||
workgroups_x *= pipeline->wg_denoms[0];
|
||||
}
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf},
|
||||
pc, { workgroups_x, workgroups_y, workgroups_z });
|
||||
@@ -11563,7 +11576,6 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
free(d_chk);
|
||||
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
||||
|
||||
ggml_vk_destroy_buffer(d_X);
|
||||
ggml_vk_destroy_buffer(d_Y);
|
||||
@@ -12148,7 +12160,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
|
||||
ggml_vk_submit(subctx, {});
|
||||
ctx->submit_pending = true;
|
||||
ggml_vk_synchronize(ctx);
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
ctx->compute_ctx = subctx;
|
||||
}
|
||||
|
||||
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
|
||||
@@ -12166,6 +12180,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
|
||||
ggml_vk_destroy_buffer(ctx->prealloc_y);
|
||||
}
|
||||
ctx->prealloc_y = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_y);
|
||||
ctx->prealloc_y_last_tensor_used = nullptr;
|
||||
}
|
||||
if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
|
||||
VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
|
||||
@@ -12746,7 +12761,6 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
||||
ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
|
||||
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
||||
|
||||
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
|
||||
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
|
||||
@@ -12775,7 +12789,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
||||
static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")");
|
||||
// discard any unsubmitted command buffers
|
||||
ctx->transfer_ctx.reset();
|
||||
ctx->compute_ctx.reset();
|
||||
// wait for any pending command buffers to finish
|
||||
ggml_vk_synchronize(ctx);
|
||||
|
||||
@@ -12808,7 +12822,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
||||
ctx->descriptor_sets.clear();
|
||||
|
||||
ctx->compute_cmd_pool.destroy(ctx->device->device);
|
||||
ctx->transfer_cmd_pool.destroy(ctx->device->device);
|
||||
if (vk_perf_logger_enabled) {
|
||||
ctx->perf_logger->print_timings(true);
|
||||
}
|
||||
@@ -13080,34 +13093,34 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
||||
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
||||
|
||||
vk_context transfer_ctx;
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
transfer_ctx = ctx->transfer_ctx.lock();
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
vk_buffer buf = buf_ctx->dev_buffer;
|
||||
|
||||
auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
|
||||
|
||||
bool ret = ggml_vk_buffer_write_async(transfer_ctx, buf, dst_offset, data, size);
|
||||
bool ret = ggml_vk_buffer_write_async(compute_ctx, buf, dst_offset, data, size);
|
||||
|
||||
if (!ret) {
|
||||
ggml_vk_ensure_sync_staging_buffer(ctx, size);
|
||||
ggml_vk_sync_buffers(nullptr, transfer_ctx);
|
||||
ggml_vk_sync_buffers(nullptr, compute_ctx);
|
||||
|
||||
vk::BufferCopy buffer_cpy;
|
||||
buffer_cpy.srcOffset = 0;
|
||||
buffer_cpy.dstOffset = dst_offset;
|
||||
buffer_cpy.size = size;
|
||||
|
||||
transfer_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
deferred_memcpy(ctx->sync_staging->ptr, data, size, &transfer_ctx->in_memcpys);
|
||||
compute_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
deferred_memcpy(ctx->sync_staging->ptr, data, size, &compute_ctx->in_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
}
|
||||
}
|
||||
@@ -13119,34 +13132,34 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
||||
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
||||
|
||||
vk_context transfer_ctx;
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
transfer_ctx = ctx->transfer_ctx.lock();
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
vk_buffer buf = buf_ctx->dev_buffer;
|
||||
|
||||
auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
|
||||
bool ret = ggml_vk_buffer_read_async(transfer_ctx, buf, src_offset, data, size);
|
||||
bool ret = ggml_vk_buffer_read_async(compute_ctx, buf, src_offset, data, size);
|
||||
|
||||
// If that failed, copy synchronously through a staging buffer
|
||||
if (!ret) {
|
||||
ggml_vk_ensure_sync_staging_buffer(ctx, size);
|
||||
ggml_vk_sync_buffers(nullptr, transfer_ctx);
|
||||
ggml_vk_sync_buffers(nullptr, compute_ctx);
|
||||
|
||||
vk::BufferCopy buffer_cpy;
|
||||
buffer_cpy.srcOffset = src_offset;
|
||||
buffer_cpy.dstOffset = 0;
|
||||
buffer_cpy.size = size;
|
||||
|
||||
transfer_ctx->s->buffer.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
|
||||
deferred_memcpy(data, ctx->sync_staging->ptr, size, &transfer_ctx->out_memcpys);
|
||||
compute_ctx->s->buffer.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
|
||||
deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
}
|
||||
}
|
||||
@@ -13158,21 +13171,21 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
|
||||
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
|
||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||
|
||||
vk_context transfer_ctx;
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
transfer_ctx = ctx->transfer_ctx.lock();
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
vk_buffer src_buf = src_buf_ctx->dev_buffer;
|
||||
vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
|
||||
|
||||
ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
|
||||
ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -13182,19 +13195,19 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
|
||||
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_synchronize()");
|
||||
|
||||
bool do_transfer = !ctx->transfer_ctx.expired();
|
||||
bool do_transfer = !ctx->compute_ctx.expired();
|
||||
|
||||
vk_context transfer_ctx;
|
||||
vk_context compute_ctx;
|
||||
if (do_transfer) {
|
||||
transfer_ctx = ctx->transfer_ctx.lock();
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
|
||||
ggml_vk_ctx_end(transfer_ctx);
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
|
||||
for (auto& cpy : transfer_ctx->in_memcpys) {
|
||||
for (auto& cpy : compute_ctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ggml_vk_submit(transfer_ctx, {});
|
||||
ggml_vk_submit(compute_ctx, {});
|
||||
ctx->submit_pending = true;
|
||||
}
|
||||
|
||||
@@ -13208,10 +13221,10 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
|
||||
if (do_transfer) {
|
||||
for (auto& cpy : transfer_ctx->out_memcpys) {
|
||||
for (auto& cpy : compute_ctx->out_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
ctx->transfer_ctx.reset();
|
||||
ctx->compute_ctx.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13880,6 +13893,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ggml_vk_submit(compute_ctx, ctx->device->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->device->fence });
|
||||
ctx->compute_ctx.reset();
|
||||
|
||||
// Get the results and pass them to the logger
|
||||
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||
@@ -14166,15 +14180,15 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
vk_context transfer_ctx;
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
transfer_ctx = ctx->transfer_ctx.lock();
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
// the backend interface doesn't have an explicit reset, so reset it here
|
||||
@@ -14182,13 +14196,13 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
|
||||
ctx->device->device.resetEvent(vkev->event);
|
||||
ctx->device->device.resetFences({ vkev->fence });
|
||||
|
||||
ggml_vk_set_event(transfer_ctx, vkev->event);
|
||||
ggml_vk_set_event(compute_ctx, vkev->event);
|
||||
|
||||
ggml_vk_ctx_end(transfer_ctx);
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
|
||||
ggml_vk_submit(transfer_ctx, {vkev->fence});
|
||||
ggml_vk_submit(compute_ctx, {vkev->fence});
|
||||
ctx->submit_pending = true;
|
||||
ctx->transfer_ctx.reset();
|
||||
ctx->compute_ctx.reset();
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
@@ -14196,20 +14210,20 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
vk_context transfer_ctx;
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
transfer_ctx = ctx->transfer_ctx.lock();
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
ggml_vk_wait_events(transfer_ctx, {vkev->event});
|
||||
ggml_vk_ctx_end(transfer_ctx);
|
||||
ctx->transfer_ctx.reset();
|
||||
ggml_vk_wait_events(compute_ctx, {vkev->event});
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
ctx->compute_ctx.reset();
|
||||
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
|
||||
@@ -53,7 +53,7 @@ void main() {
|
||||
const uint32_t d_tid = gl_LocalInvocationIndex % D_split;
|
||||
const uint32_t col_tid = gl_LocalInvocationIndex / D_split;
|
||||
|
||||
uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
|
||||
uint32_t q_offset = gqa_iq1*p.nb01 + (iq2*p.nb02 + iq3*p.nb03) / 4;
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (HSK / 4);
|
||||
@@ -101,9 +101,9 @@ void main() {
|
||||
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
|
||||
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
|
||||
#endif
|
||||
uint32_t m_offset = 0;
|
||||
uint32_t m_offset = gqa_iq1*KV;
|
||||
if (p.nem2 != 1 || p.nem3 != 1) {
|
||||
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
|
||||
m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
@@ -320,7 +320,8 @@ void main() {
|
||||
// If there is split_k, then the split_k resolve shader does the final
|
||||
// division by L. Store the intermediate O value and per-row m and L values.
|
||||
if (p.k_num > 1) {
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
||||
// note: O and Q have swapped coord 1,2.
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
|
||||
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
if (r < N) {
|
||||
@@ -332,7 +333,7 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
||||
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
if (r < N) {
|
||||
perElemOpStoreCol0(r, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
|
||||
@@ -378,7 +379,7 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
||||
uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV;
|
||||
|
||||
if (p.gqa_ratio > 1) {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
|
||||
@@ -165,7 +165,7 @@ ACC_TYPE perElemOpGetSink(const in uint32_t r, const in uint32_t c, const in ACC
|
||||
}
|
||||
|
||||
uint32_t i, N, KV, split_k_index, Tr, start_j, end_j,
|
||||
iq2, iq3, rk2, rk3, rv2, rv3, ik2, ik3, iv2, iv3,
|
||||
gqa_iq1, iq2, iq3, rk2, rk3, rv2, rv3, ik2, ik3, iv2, iv3,
|
||||
q_stride, k_stride, v_stride, m_stride;
|
||||
|
||||
void init_indices()
|
||||
@@ -173,12 +173,19 @@ void init_indices()
|
||||
N = p.N;
|
||||
KV = p.KV;
|
||||
|
||||
i = gl_WorkGroupID.x;
|
||||
split_k_index = 0;
|
||||
|
||||
if (p.k_num > 1) {
|
||||
i = 0;
|
||||
split_k_index = gl_WorkGroupID.x;
|
||||
// batch and split_k share gl_WorkGroupID.x
|
||||
gqa_iq1 = gl_WorkGroupID.x / p.k_num;
|
||||
split_k_index = gl_WorkGroupID.x % p.k_num;
|
||||
} else if (p.gqa_ratio > 1) {
|
||||
i = 0;
|
||||
gqa_iq1 = gl_WorkGroupID.x;
|
||||
split_k_index = 0;
|
||||
} else {
|
||||
i = gl_WorkGroupID.x;
|
||||
gqa_iq1 = 0;
|
||||
split_k_index = 0;
|
||||
}
|
||||
|
||||
Tr = CEIL_DIV(N, Br);
|
||||
|
||||
@@ -90,7 +90,7 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
|
||||
uint32_t q_offset = gqa_iq1*p.nb01 + (iq2*p.nb02+iq3*p.nb03) / 4;
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (HSK / 4);
|
||||
@@ -141,9 +141,9 @@ void main() {
|
||||
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
|
||||
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
|
||||
#endif
|
||||
uint32_t m_offset = 0;
|
||||
uint32_t m_offset = gqa_iq1*KV;
|
||||
if (p.nem2 != 1 || p.nem3 != 1) {
|
||||
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
|
||||
m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
@@ -370,7 +370,8 @@ void main() {
|
||||
// If there is split_k, then the split_k resolve shader does the final
|
||||
// division by L. Store the intermediate O value and per-row m and L values.
|
||||
if (p.k_num > 1) {
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
||||
// note: O and Q have swapped coord 1,2.
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
|
||||
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
if (tile_row(r) < N) {
|
||||
@@ -382,7 +383,7 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
||||
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
if (tile_row(r) < N) {
|
||||
perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
|
||||
@@ -428,7 +429,7 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
||||
uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV;
|
||||
|
||||
if (p.gqa_ratio > 1) {
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
|
||||
@@ -111,7 +111,7 @@ void main() {
|
||||
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
|
||||
|
||||
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
|
||||
uint32_t q_offset = gqa_iq1*p.nb01*4/*sizeof(float)*/ + iq2*p.nb02+iq3*p.nb03;
|
||||
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
|
||||
|
||||
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
|
||||
@@ -138,9 +138,9 @@ void main() {
|
||||
coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2);
|
||||
}
|
||||
|
||||
uint32_t m_offset = 0;
|
||||
uint32_t m_offset = gqa_iq1*KV * 2 /*sizeof(float16_t)*/;
|
||||
if (p.nem2 != 1 || p.nem3 != 1) {
|
||||
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/;
|
||||
m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/;
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
@@ -272,10 +272,11 @@ void main() {
|
||||
if (p.k_num > 1) {
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
|
||||
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
||||
// note: O and Q have swapped coord 1,2.
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
|
||||
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
||||
|
||||
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
||||
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
|
||||
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
|
||||
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
|
||||
return;
|
||||
@@ -325,7 +326,7 @@ void main() {
|
||||
[[unroll]] for (uint i = 0; i < O.length(); ++i) { O[i] = clamp(O[i], -ACC_TYPE_MAX, ACC_TYPE_MAX); }
|
||||
#endif
|
||||
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
||||
uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV;
|
||||
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
|
||||
if (p.gqa_ratio > 1) {
|
||||
|
||||
@@ -12,7 +12,8 @@ layout (binding = 2) writeonly buffer D {float data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint D;
|
||||
uint N;
|
||||
uint ne1;
|
||||
uint ne2;
|
||||
uint ne3;
|
||||
uint k_num;
|
||||
uint sinks;
|
||||
@@ -24,15 +25,15 @@ void main() {
|
||||
// Each workgroup handles a row
|
||||
const uint n = gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint iq3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.z % p.ne2;
|
||||
const uint i3 = gl_WorkGroupID.z / p.ne2;
|
||||
|
||||
uint D = p.D;
|
||||
uint N = p.N;
|
||||
uint k_num = p.k_num;
|
||||
|
||||
uint l_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + n;
|
||||
uint m_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + N + n;
|
||||
uint lm_stride = N * 2;
|
||||
uint l_offset = D * p.ne1 * p.ne2 * p.ne3 * k_num + p.ne1 * 2 * (0/*split_k_index*/ + p.k_num * (i2 + p.ne2 * i3)) + n;
|
||||
uint m_offset = D * p.ne1 * p.ne2 * p.ne3 * k_num + p.ne1 * 2 * (0/*split_k_index*/ + p.k_num * (i2 + p.ne2 * i3)) + p.ne1 + n;
|
||||
uint lm_stride = p.ne1 * 2;
|
||||
|
||||
// Compute the max m value for the row
|
||||
float m_max = -1.0/0.0;
|
||||
@@ -99,7 +100,7 @@ void main() {
|
||||
if (d < D) {
|
||||
float O = 0.0;
|
||||
[[unroll]] for (uint k = 0; k < k_num; ++k) {
|
||||
uint o_offset = D * N * (k + iq3 * k_num) + D * n + d;
|
||||
uint o_offset = D * p.ne1 * (k + p.k_num * (i2 + p.ne2 * i3)) + D * n + d;
|
||||
float m = data_a[m_offset + k * lm_stride];
|
||||
O += exp(m - m_max) * data_a[o_offset];
|
||||
}
|
||||
@@ -115,6 +116,6 @@ void main() {
|
||||
const float FLT_MAX = uintBitsToFloat(0x7F7FFFFF);
|
||||
O = clamp(O, -FLT_MAX, FLT_MAX);
|
||||
|
||||
data_d[iq3 * D * N + D * n + d] = O;
|
||||
data_d[(i3 * p.ne2 + i2) * p.ne1 * D + D * n + d] = O;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ pytest~=8.3.3
|
||||
huggingface_hub>=0.34.0,<1.0
|
||||
matplotlib~=3.10.0
|
||||
numpy~=1.26.4
|
||||
openai~=1.55.3
|
||||
openai~=2.14.0
|
||||
pandas~=2.2.3
|
||||
prometheus-client~=0.20.0
|
||||
requests~=2.32.3
|
||||
|
||||
@@ -8460,6 +8460,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
// Qwen3-VL-8B https://github.com/ggml-org/llama.cpp/issues/17012
|
||||
test_cases.emplace_back(new test_flash_attn_ext(72, 72, 16, {1, 1}, 5776, 5776, false, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
|
||||
|
||||
test_cases.emplace_back(new test_flash_attn_ext(64, 64, 8, {8, 1}, 7680, 1, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
|
||||
test_cases.emplace_back(new test_flash_attn_ext(64, 64, 8, {8, 1}, 7680, 4, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
|
||||
|
||||
for (int kv : { 4096, 8192, 16384, }) {
|
||||
for (int hs : { 64, 128, }) {
|
||||
for (int nr : { 1, 4, }) {
|
||||
|
||||
@@ -6,7 +6,7 @@ Set of LLM REST APIs and a web UI to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
|
||||
* [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) compatible chat completions
|
||||
* Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510)
|
||||
* Parallel decoding with multi-user support
|
||||
@@ -1267,6 +1267,49 @@ This provides information on the performance of the server. It also allows calcu
|
||||
|
||||
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
|
||||
|
||||
### POST `/v1/responses`: OpenAI-compatible Responses API
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).
|
||||
|
||||
*Examples:*
|
||||
|
||||
You can use either Python `openai` library with appropriate checkpoints:
|
||||
|
||||
```python
|
||||
import openai
|
||||
|
||||
client = openai.OpenAI(
|
||||
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
||||
api_key = "sk-no-key-required"
|
||||
)
|
||||
|
||||
response = client.responses.create(
|
||||
model="gpt-4.1",
|
||||
instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
|
||||
input="Write a limerick about python exceptions"
|
||||
)
|
||||
|
||||
print(response.output_text)
|
||||
```
|
||||
|
||||
... or raw HTTP requests:
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/responses \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"model": "gpt-4.1",
|
||||
"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
|
||||
"input": "Write a limerick about python exceptions"
|
||||
}'
|
||||
```
|
||||
|
||||
This endpoint works by converting Responses request into Chat Completions request.
|
||||
|
||||
|
||||
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
|
||||
|
||||
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
|
||||
|
||||
@@ -1069,6 +1069,283 @@ json oaicompat_chat_params_parse(
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
json convert_responses_to_chatcmpl(const json & response_body) {
|
||||
if (!response_body.contains("input")) {
|
||||
throw std::invalid_argument("'input' is required");
|
||||
}
|
||||
if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
|
||||
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
|
||||
}
|
||||
|
||||
const json input_value = response_body.at("input");
|
||||
json chatcmpl_body = response_body;
|
||||
chatcmpl_body.erase("input");
|
||||
std::vector<json> chatcmpl_messages;
|
||||
|
||||
if (response_body.contains("instructions")) {
|
||||
chatcmpl_messages.push_back({
|
||||
{"role", "system"},
|
||||
{"content", json_value(response_body, "instructions", std::string())},
|
||||
});
|
||||
chatcmpl_body.erase("instructions");
|
||||
}
|
||||
|
||||
if (input_value.is_string()) {
|
||||
// #responses_create-input-text_input
|
||||
chatcmpl_messages.push_back({
|
||||
{"role", "user"},
|
||||
{"content", input_value},
|
||||
});
|
||||
} else if (input_value.is_array()) {
|
||||
// #responses_create-input-input_item_list
|
||||
|
||||
static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
|
||||
return j.contains(key) && j.at(key).is_array();
|
||||
};
|
||||
static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
|
||||
return j.contains(key) && j.at(key).is_string();
|
||||
};
|
||||
|
||||
for (json item : input_value) {
|
||||
if (exists_and_is_string(item, "content")) {
|
||||
// #responses_create-input-input_item_list-input_message-content-text_input
|
||||
// Only "Input message" contains item["content"]::string
|
||||
// After converting item["content"]::string to item["content"]::array,
|
||||
// we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
|
||||
item["content"] = json::array({
|
||||
json {
|
||||
{"text", item.at("content")},
|
||||
{"type", "input_text"}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (exists_and_is_array(item, "content") &&
|
||||
exists_and_is_string(item, "role") &&
|
||||
(item.at("role") == "user" ||
|
||||
item.at("role") == "system" ||
|
||||
item.at("role") == "developer")
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-input_message
|
||||
std::vector<json> chatcmpl_content;
|
||||
|
||||
for (const json & input_item : item.at("content")) {
|
||||
const std::string type = json_value(input_item, "type", std::string());
|
||||
|
||||
if (type == "input_text") {
|
||||
if (!input_item.contains("text")) {
|
||||
throw std::invalid_argument("'Input text' requires 'text'");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"text", input_item.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
} else if (type == "input_image") {
|
||||
// While `detail` is marked as required,
|
||||
// it has default value("auto") and can be omitted.
|
||||
|
||||
if (!input_item.contains("image_url")) {
|
||||
throw std::invalid_argument("'image_url' is required");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"image_url", json {
|
||||
{"url", input_item.at("image_url")}
|
||||
}},
|
||||
{"type", "image_url"},
|
||||
});
|
||||
} else if (type == "input_file") {
|
||||
throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
|
||||
// if (input_item.contains("file_url")) {
|
||||
// // chat completion API does not support file_url
|
||||
// throw std::invalid_argument("'file_url' is not supported");
|
||||
// }
|
||||
// if (!input_item.contains("file_data") || !input_item.contains("filename")) {
|
||||
// throw std::invalid_argument("Both 'file_data' and 'filename' are required");
|
||||
// }
|
||||
// chatcmpl_content.push_back({
|
||||
// {"file", json {
|
||||
// {"file_data", input_item.at("file_data")},
|
||||
// {"filename", input_item.at("filename")},
|
||||
// }},
|
||||
// {"type", "file"},
|
||||
// });
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
|
||||
}
|
||||
}
|
||||
|
||||
if (item.contains("type")) {
|
||||
item.erase("type");
|
||||
}
|
||||
if (item.contains("status")) {
|
||||
item.erase("status");
|
||||
}
|
||||
item["content"] = chatcmpl_content;
|
||||
|
||||
chatcmpl_messages.push_back(item);
|
||||
} else if (exists_and_is_array(item, "content") &&
|
||||
exists_and_is_string(item, "role") &&
|
||||
item.at("role") == "assistant" &&
|
||||
// exists_and_is_string(item, "status") &&
|
||||
// (item.at("status") == "in_progress" ||
|
||||
// item.at("status") == "completed" ||
|
||||
// item.at("status") == "incomplete") &&
|
||||
// item["status"] not sent by codex-cli
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "message"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-output_message
|
||||
std::vector<json> chatcmpl_content;
|
||||
|
||||
for (const auto & output_text : item.at("content")) {
|
||||
const std::string type = json_value(output_text, "type", std::string());
|
||||
if (type != "output_text") {
|
||||
throw std::invalid_argument("'type' must be 'output_text'");
|
||||
}
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
}
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
}
|
||||
|
||||
item.erase("status");
|
||||
item.erase("type");
|
||||
item["content"] = chatcmpl_content;
|
||||
chatcmpl_messages.push_back(item);
|
||||
} else if (exists_and_is_string(item, "arguments") &&
|
||||
exists_and_is_string(item, "call_id") &&
|
||||
exists_and_is_string(item, "name") &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "function_call"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-function_tool_call
|
||||
json msg = json {
|
||||
{"role", "assistant"},
|
||||
{"tool_calls", json::array({ json {
|
||||
{"function", json {
|
||||
{"arguments", item.at("arguments")},
|
||||
{"name", item.at("name")},
|
||||
}},
|
||||
{"id", item.at("call_id")},
|
||||
{"type", "function"},
|
||||
}})},
|
||||
};
|
||||
|
||||
if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
|
||||
// Move reasoning content from dummy message to tool call message
|
||||
msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
|
||||
chatcmpl_messages.pop_back();
|
||||
}
|
||||
chatcmpl_messages.push_back(msg);
|
||||
} else if (exists_and_is_string(item, "call_id") &&
|
||||
(exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "function_call_output"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-function_tool_call_output
|
||||
if (item.at("output").is_string()) {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"content", item.at("output")},
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", item.at("call_id")},
|
||||
});
|
||||
} else {
|
||||
json chatcmpl_outputs = item.at("output");
|
||||
for (json & chatcmpl_output : chatcmpl_outputs) {
|
||||
if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
|
||||
throw std::invalid_argument("Output of tool call should be 'Input text'");
|
||||
}
|
||||
chatcmpl_output["type"] = "text";
|
||||
}
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"content", chatcmpl_outputs},
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", item.at("call_id")},
|
||||
});
|
||||
}
|
||||
} else if (// exists_and_is_string(item, "id") &&
|
||||
// item["id"] not sent by codex-cli
|
||||
exists_and_is_array(item, "summary") &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "reasoning") {
|
||||
// #responses_create-input-input_item_list-item-reasoning
|
||||
|
||||
if (!exists_and_is_array(item, "content")) {
|
||||
throw std::invalid_argument("item['content'] is not an array");
|
||||
}
|
||||
if (item.at("content").empty()) {
|
||||
throw std::invalid_argument("item['content'] is empty");
|
||||
}
|
||||
if (!exists_and_is_string(item.at("content")[0], "text")) {
|
||||
throw std::invalid_argument("item['content']['text'] is not a string");
|
||||
}
|
||||
|
||||
// Pack reasoning content in dummy message
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"role", "assistant"},
|
||||
{"content", json::array()},
|
||||
{"reasoning_content", item.at("content")[0].at("text")},
|
||||
});
|
||||
} else {
|
||||
throw std::invalid_argument("Cannot determine type of 'item'");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("'input' must be a string or array of objects");
|
||||
}
|
||||
|
||||
// Remove unused dummy message which contains
|
||||
// reasoning content not followed by tool call
|
||||
chatcmpl_messages.erase(std::remove_if(
|
||||
chatcmpl_messages.begin(),
|
||||
chatcmpl_messages.end(),
|
||||
[](const json & x){ return x.contains("role") &&
|
||||
x.at("role") == "assistant" &&
|
||||
x.contains("content") &&
|
||||
x.at("content") == json::array() &&
|
||||
x.contains("reasoning_content");
|
||||
}),
|
||||
chatcmpl_messages.end()
|
||||
);
|
||||
|
||||
chatcmpl_body["messages"] = chatcmpl_messages;
|
||||
|
||||
if (response_body.contains("tools")) {
|
||||
if (!response_body.at("tools").is_array()) {
|
||||
throw std::invalid_argument("'tools' must be an array of objects");
|
||||
}
|
||||
std::vector<json> chatcmpl_tools;
|
||||
for (json resp_tool : response_body.at("tools")) {
|
||||
json chatcmpl_tool;
|
||||
|
||||
if (json_value(resp_tool, "type", std::string()) != "function") {
|
||||
throw std::invalid_argument("'type' of tool must be 'function'");
|
||||
}
|
||||
resp_tool.erase("type");
|
||||
chatcmpl_tool["type"] = "function";
|
||||
|
||||
if (!resp_tool.contains("strict")) {
|
||||
resp_tool["strict"] = true;
|
||||
}
|
||||
chatcmpl_tool["function"] = resp_tool;
|
||||
chatcmpl_tools.push_back(chatcmpl_tool);
|
||||
}
|
||||
chatcmpl_body.erase("tools");
|
||||
chatcmpl_body["tools"] = chatcmpl_tools;
|
||||
}
|
||||
|
||||
if (response_body.contains("max_output_tokens")) {
|
||||
chatcmpl_body.erase("max_output_tokens");
|
||||
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
|
||||
}
|
||||
|
||||
return chatcmpl_body;
|
||||
}
|
||||
|
||||
json convert_anthropic_to_oai(const json & body) {
|
||||
json oai_body;
|
||||
|
||||
@@ -1482,6 +1759,24 @@ std::string format_oai_sse(const json & data) {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string format_oai_resp_sse(const json & data) {
|
||||
std::ostringstream ss;
|
||||
auto send_single = [&ss](const json & event_obj) {
|
||||
ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
|
||||
ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
|
||||
};
|
||||
|
||||
if (data.is_array()) {
|
||||
for (const auto & item : data) {
|
||||
send_single(item);
|
||||
}
|
||||
} else {
|
||||
send_single(data);
|
||||
}
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string format_anthropic_sse(const json & data) {
|
||||
std::ostringstream ss;
|
||||
|
||||
|
||||
@@ -294,6 +294,9 @@ json oaicompat_chat_params_parse(
|
||||
const server_chat_params & opt,
|
||||
std::vector<raw_buffer> & out_files);
|
||||
|
||||
// convert OpenAI Responses API format to OpenAI Chat Completions API format
|
||||
json convert_responses_to_chatcmpl(const json & body);
|
||||
|
||||
// convert Anthropic Messages API format to OpenAI Chat Completions API format
|
||||
json convert_anthropic_to_oai(const json & body);
|
||||
|
||||
@@ -331,6 +334,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
|
||||
// note: if data is a json array, it will be sent as multiple events, one per item
|
||||
std::string format_oai_sse(const json & data);
|
||||
|
||||
std::string format_oai_resp_sse(const json & data);
|
||||
|
||||
// format Anthropic-style SSE with event types
|
||||
std::string format_anthropic_sse(const json & data);
|
||||
|
||||
|
||||
@@ -3073,6 +3073,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
json first_result_json = first_result->to_json();
|
||||
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
|
||||
res->data = format_anthropic_sse(first_result_json);
|
||||
} else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
|
||||
res->data = format_oai_resp_sse(first_result_json);
|
||||
} else {
|
||||
res->data = format_oai_sse(first_result_json);
|
||||
}
|
||||
@@ -3107,13 +3109,16 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
|
||||
// check if there is more data
|
||||
if (!rd.has_next()) {
|
||||
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
|
||||
// Anthropic doesn't send [DONE], message_stop was already sent
|
||||
output = "";
|
||||
} else if (res_type != TASK_RESPONSE_TYPE_NONE) {
|
||||
output = "data: [DONE]\n\n";
|
||||
} else {
|
||||
output = "";
|
||||
switch (res_type) {
|
||||
case TASK_RESPONSE_TYPE_NONE:
|
||||
case TASK_RESPONSE_TYPE_OAI_RESP:
|
||||
case TASK_RESPONSE_TYPE_ANTHROPIC:
|
||||
output = "";
|
||||
break;
|
||||
|
||||
default:
|
||||
output = "data: [DONE]\n\n";
|
||||
break;
|
||||
}
|
||||
SRV_DBG("%s", "all results received, terminating stream\n");
|
||||
return false; // no more data, terminate
|
||||
@@ -3141,6 +3146,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
json res_json = result->to_json();
|
||||
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
|
||||
output = format_anthropic_sse(res_json);
|
||||
} else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
|
||||
output = format_oai_resp_sse(res_json);
|
||||
} else {
|
||||
output = format_oai_sse(res_json);
|
||||
}
|
||||
@@ -3575,6 +3582,22 @@ void server_routes::init_routes() {
|
||||
TASK_RESPONSE_TYPE_OAI_CHAT);
|
||||
};
|
||||
|
||||
this->post_responses_oai = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
json body = convert_responses_to_chatcmpl(json::parse(req.body));
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
meta->chat_params,
|
||||
files);
|
||||
return handle_completions_impl(
|
||||
req,
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
body_parsed,
|
||||
files,
|
||||
TASK_RESPONSE_TYPE_OAI_RESP);
|
||||
};
|
||||
|
||||
this->post_anthropic_messages = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
|
||||
@@ -94,6 +94,7 @@ struct server_routes {
|
||||
server_http_context::handler_t post_completions;
|
||||
server_http_context::handler_t post_completions_oai;
|
||||
server_http_context::handler_t post_chat_completions;
|
||||
server_http_context::handler_t post_responses_oai;
|
||||
server_http_context::handler_t post_anthropic_messages;
|
||||
server_http_context::handler_t post_anthropic_count_tokens;
|
||||
server_http_context::handler_t post_apply_template;
|
||||
|
||||
@@ -584,6 +584,8 @@ json server_task_result_cmpl_final::to_json() {
|
||||
return to_json_oaicompat();
|
||||
case TASK_RESPONSE_TYPE_OAI_CHAT:
|
||||
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
|
||||
case TASK_RESPONSE_TYPE_OAI_RESP:
|
||||
return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
|
||||
case TASK_RESPONSE_TYPE_ANTHROPIC:
|
||||
return stream ? to_json_anthropic_stream() : to_json_anthropic();
|
||||
default:
|
||||
@@ -801,6 +803,186 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
|
||||
return deltas;
|
||||
}
|
||||
|
||||
json server_task_result_cmpl_final::to_json_oaicompat_resp() {
|
||||
common_chat_msg msg;
|
||||
if (!oaicompat_msg.empty()) {
|
||||
msg = oaicompat_msg;
|
||||
} else {
|
||||
msg.role = "assistant";
|
||||
msg.content = content;
|
||||
}
|
||||
|
||||
std::vector<json> output;
|
||||
|
||||
if (msg.reasoning_content != "") {
|
||||
output.push_back(json {
|
||||
{"id", "rs_" + random_string()},
|
||||
{"summary", json::array()},
|
||||
{"type", "reasoning"},
|
||||
{"content", json::array({ json {
|
||||
{"text", msg.reasoning_content},
|
||||
{"type", "reasoning_text"},
|
||||
}})},
|
||||
{"encrypted_content", ""},
|
||||
{"status", "completed"},
|
||||
});
|
||||
}
|
||||
|
||||
if (msg.content != "") {
|
||||
output.push_back(json {
|
||||
{"content", json::array({ json {
|
||||
{"type", "output_text"},
|
||||
{"annotations", json::array()},
|
||||
{"logprobs", json::array()},
|
||||
{"text", msg.content},
|
||||
}})},
|
||||
{"id", "msg_" + random_string()},
|
||||
{"role", msg.role},
|
||||
{"status", "completed"},
|
||||
{"type", "message"},
|
||||
});
|
||||
}
|
||||
|
||||
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
|
||||
output.push_back(json {
|
||||
{"type", "function_call"},
|
||||
{"status", "completed"},
|
||||
{"arguments", tool_call.arguments},
|
||||
{"call_id", "fc_" + tool_call.id},
|
||||
{"name", tool_call.name},
|
||||
});
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
json res = {
|
||||
{"completed_at", t},
|
||||
{"created_at", t},
|
||||
{"id", oai_resp_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "response"},
|
||||
{"output", output},
|
||||
{"status", "completed"},
|
||||
{"usage", json {
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
{"output_tokens", n_decoded},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
}},
|
||||
};
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
||||
std::vector<json> server_sent_events;
|
||||
std::vector<json> output;
|
||||
|
||||
if (oaicompat_msg.reasoning_content != "") {
|
||||
const json output_item = json {
|
||||
{"id", oai_resp_reasoning_id},
|
||||
{"summary", json::array()},
|
||||
{"type", "reasoning"},
|
||||
{"content", json::array({ json {
|
||||
{"text", oaicompat_msg.reasoning_content},
|
||||
{"type", "reasoning_text"},
|
||||
}})},
|
||||
{"encrypted_content", ""},
|
||||
};
|
||||
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_item.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.done"},
|
||||
{"item", output_item}
|
||||
}}
|
||||
});
|
||||
output.push_back(output_item);
|
||||
}
|
||||
|
||||
if (oaicompat_msg.content != "") {
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_text.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_text.done"},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"text", oaicompat_msg.content}
|
||||
}}
|
||||
});
|
||||
|
||||
const json content_part = {
|
||||
{"type", "output_text"},
|
||||
{"annotations", json::array()},
|
||||
{"logprobs", json::array()},
|
||||
{"text", oaicompat_msg.content}
|
||||
};
|
||||
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.content_part.done"},
|
||||
{"data", json {
|
||||
{"type", "response.content_part.done"},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"part", content_part}
|
||||
}}
|
||||
});
|
||||
const json output_item = {
|
||||
{"type", "message"},
|
||||
{"status", "completed"},
|
||||
{"id", oai_resp_message_id},
|
||||
{"content", json::array({content_part})},
|
||||
{"role", "assistant"}
|
||||
};
|
||||
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_item.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.done"},
|
||||
{"item", output_item}
|
||||
}}
|
||||
});
|
||||
output.push_back(output_item);
|
||||
}
|
||||
|
||||
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
|
||||
const json output_item = {
|
||||
{"type", "function_call"},
|
||||
{"status", "completed"},
|
||||
{"arguments", tool_call.arguments},
|
||||
{"call_id", "fc_" + tool_call.id},
|
||||
{"name", tool_call.name}
|
||||
};
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_item.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.done"},
|
||||
{"item", output_item}
|
||||
}}
|
||||
});
|
||||
output.push_back(output_item);
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.completed"},
|
||||
{"data", json {
|
||||
{"type", "response.completed"},
|
||||
{"response", json {
|
||||
{"id", oai_resp_id},
|
||||
{"object", "response"},
|
||||
{"created_at", t},
|
||||
{"status", "completed"},
|
||||
{"model", oaicompat_model},
|
||||
{"output", output},
|
||||
{"usage", json {
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
{"output_tokens", n_decoded},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens}
|
||||
}}
|
||||
}},
|
||||
}}
|
||||
});
|
||||
|
||||
return server_sent_events;
|
||||
}
|
||||
|
||||
json server_task_result_cmpl_final::to_json_anthropic() {
|
||||
std::string stop_reason = "max_tokens";
|
||||
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
|
||||
@@ -1057,6 +1239,36 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||
//
|
||||
// server_task_result_cmpl_partial
|
||||
//
|
||||
void server_task_result_cmpl_partial::update(task_result_state & state) {
|
||||
is_updated = true;
|
||||
state.update_chat_msg(content, true, oaicompat_msg_diffs);
|
||||
|
||||
// Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
|
||||
thinking_block_started = state.thinking_block_started;
|
||||
text_block_started = state.text_block_started;
|
||||
|
||||
oai_resp_id = state.oai_resp_id;
|
||||
oai_resp_reasoning_id = state.oai_resp_reasoning_id;
|
||||
oai_resp_message_id = state.oai_resp_message_id;
|
||||
oai_resp_fc_id = state.oai_resp_fc_id;
|
||||
|
||||
// track if the accumulated message has any reasoning content
|
||||
anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
|
||||
|
||||
// Pre-compute state updates based on diffs (for next chunk)
|
||||
for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
|
||||
if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
|
||||
state.thinking_block_started = true;
|
||||
}
|
||||
if (!diff.content_delta.empty() && !state.text_block_started) {
|
||||
state.text_block_started = true;
|
||||
}
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
state.oai_resp_fc_id = diff.tool_call_delta.id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json server_task_result_cmpl_partial::to_json() {
|
||||
GGML_ASSERT(is_updated && "update() must be called before to_json()");
|
||||
switch (res_type) {
|
||||
@@ -1066,6 +1278,8 @@ json server_task_result_cmpl_partial::to_json() {
|
||||
return to_json_oaicompat();
|
||||
case TASK_RESPONSE_TYPE_OAI_CHAT:
|
||||
return to_json_oaicompat_chat();
|
||||
case TASK_RESPONSE_TYPE_OAI_RESP:
|
||||
return to_json_oaicompat_resp();
|
||||
case TASK_RESPONSE_TYPE_ANTHROPIC:
|
||||
return to_json_anthropic();
|
||||
default:
|
||||
@@ -1190,6 +1404,132 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
|
||||
return deltas;
|
||||
}
|
||||
|
||||
json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
|
||||
std::vector<json> events;
|
||||
|
||||
if (n_decoded == 1) {
|
||||
events.push_back(json {
|
||||
{"event", "response.created"},
|
||||
{"data", json {
|
||||
{"type", "response.created"},
|
||||
{"response", json {
|
||||
{"id", oai_resp_id},
|
||||
{"object", "response"},
|
||||
{"status", "in_progress"},
|
||||
}},
|
||||
}},
|
||||
});
|
||||
events.push_back(json {
|
||||
{"event", "response.in_progress"},
|
||||
{"data", json {
|
||||
{"type", "response.in_progress"},
|
||||
{"response", json {
|
||||
{"id", oai_resp_id},
|
||||
{"object", "response"},
|
||||
{"status", "in_progress"},
|
||||
}},
|
||||
}},
|
||||
});
|
||||
}
|
||||
|
||||
for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
if (!thinking_block_started) {
|
||||
events.push_back(json {
|
||||
{"event", "response.output_item.added"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.added"},
|
||||
{"item", json {
|
||||
{"id", oai_resp_reasoning_id},
|
||||
{"summary", json::array()},
|
||||
{"type", "reasoning"},
|
||||
{"content", json::array()},
|
||||
{"encrypted_content", ""},
|
||||
{"status", "in_progress"},
|
||||
}},
|
||||
}},
|
||||
});
|
||||
thinking_block_started = true;
|
||||
}
|
||||
events.push_back(json {
|
||||
{"event", "response.reasoning_text.delta"},
|
||||
{"data", json {
|
||||
{"type", "response.reasoning_text.delta"},
|
||||
{"delta", diff.reasoning_content_delta},
|
||||
{"item_id", oai_resp_reasoning_id},
|
||||
}},
|
||||
});
|
||||
}
|
||||
|
||||
if (!diff.content_delta.empty()) {
|
||||
if (!text_block_started) {
|
||||
events.push_back(json {
|
||||
{"event", "response.output_item.added"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.added"},
|
||||
{"item", json {
|
||||
{"content", json::array()},
|
||||
{"id", oai_resp_message_id},
|
||||
{"role", "assistant"},
|
||||
{"status", "in_progress"},
|
||||
{"type", "message"},
|
||||
}},
|
||||
}},
|
||||
});
|
||||
events.push_back(json {
|
||||
{"event", "response.content_part.added"},
|
||||
{"data", json {
|
||||
{"type", "response.content_part.added"},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"part", json {
|
||||
{"type", "output_text"},
|
||||
{"text", ""},
|
||||
}},
|
||||
}},
|
||||
});
|
||||
text_block_started = true;
|
||||
}
|
||||
events.push_back(json {
|
||||
{"event", "response.output_text.delta"},
|
||||
{"data", json {
|
||||
{"type", "response.output_text.delta"},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"delta", diff.content_delta},
|
||||
}},
|
||||
});
|
||||
}
|
||||
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
events.push_back(json {
|
||||
{"event", "response.output_item.added"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.added"},
|
||||
{"item", json {
|
||||
{"arguments", ""},
|
||||
{"call_id", "fc_" + diff.tool_call_delta.id},
|
||||
{"name", diff.tool_call_delta.name},
|
||||
{"type", "function_call"},
|
||||
{"status", "in_progress"},
|
||||
}},
|
||||
}},
|
||||
});
|
||||
oai_resp_fc_id = diff.tool_call_delta.id;
|
||||
}
|
||||
|
||||
if (!diff.tool_call_delta.arguments.empty()) {
|
||||
events.push_back(json {
|
||||
{"event", "response.function_call_arguments.delta"},
|
||||
{"data", json {
|
||||
{"type", "response.function_call_arguments.delta"},
|
||||
{"delta", diff.tool_call_delta.arguments},
|
||||
{"item_id", "fc_" + oai_resp_fc_id},
|
||||
}},
|
||||
});
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
//
|
||||
// server_task_result_embd
|
||||
//
|
||||
@@ -1260,8 +1600,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||
|
||||
// use local copies of streaming state (copied from task_result_state in update())
|
||||
// these reflect the state BEFORE this chunk was processed
|
||||
bool thinking_started = anthropic_thinking_block_started;
|
||||
bool text_started = anthropic_text_block_started;
|
||||
bool thinking_started = thinking_block_started;
|
||||
bool text_started = text_block_started;
|
||||
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
// handle thinking/reasoning content
|
||||
|
||||
@@ -33,6 +33,7 @@ enum task_response_type {
|
||||
TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
|
||||
TASK_RESPONSE_TYPE_OAI_CHAT,
|
||||
TASK_RESPONSE_TYPE_OAI_CMPL,
|
||||
TASK_RESPONSE_TYPE_OAI_RESP,
|
||||
TASK_RESPONSE_TYPE_OAI_EMBD,
|
||||
TASK_RESPONSE_TYPE_ANTHROPIC,
|
||||
};
|
||||
@@ -98,12 +99,22 @@ struct task_result_state {
|
||||
std::string generated_text; // append new chunks of generated text here
|
||||
std::vector<std::string> generated_tool_call_ids;
|
||||
|
||||
// for Anthropic API streaming: track content block state across chunks
|
||||
bool anthropic_thinking_block_started = false;
|
||||
bool anthropic_text_block_started = false;
|
||||
// for OpenAI Responses and Anthropic streaming API:
|
||||
// track output item / content block state across chunks
|
||||
bool thinking_block_started = false;
|
||||
bool text_block_started = false;
|
||||
|
||||
// for OpenAI Responses streaming API
|
||||
const std::string oai_resp_id;
|
||||
const std::string oai_resp_reasoning_id;
|
||||
const std::string oai_resp_message_id;
|
||||
std::string oai_resp_fc_id; // function call ID for current args delta
|
||||
|
||||
task_result_state(const common_chat_parser_params & chat_parser_params)
|
||||
: chat_parser_params(chat_parser_params) {}
|
||||
: chat_parser_params(chat_parser_params)
|
||||
, oai_resp_id("resp_" + random_string())
|
||||
, oai_resp_reasoning_id("rs_" + random_string())
|
||||
, oai_resp_message_id("msg_" + random_string()) {}
|
||||
|
||||
// parse partial tool calls and update the internal state
|
||||
common_chat_msg update_chat_msg(
|
||||
@@ -352,6 +363,11 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
||||
bool is_updated = false;
|
||||
|
||||
// for OpenAI Responses API
|
||||
std::string oai_resp_id;
|
||||
std::string oai_resp_reasoning_id;
|
||||
std::string oai_resp_message_id;
|
||||
|
||||
virtual bool is_stop() override {
|
||||
return true; // in stream mode, final responses are considered stop
|
||||
}
|
||||
@@ -361,6 +377,10 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
virtual void update(task_result_state & state) override {
|
||||
is_updated = true;
|
||||
oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);
|
||||
|
||||
oai_resp_id = state.oai_resp_id;
|
||||
oai_resp_reasoning_id = state.oai_resp_reasoning_id;
|
||||
oai_resp_message_id = state.oai_resp_message_id;
|
||||
}
|
||||
|
||||
json to_json_non_oaicompat();
|
||||
@@ -371,6 +391,10 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
|
||||
json to_json_oaicompat_chat_stream();
|
||||
|
||||
json to_json_oaicompat_resp();
|
||||
|
||||
json to_json_oaicompat_resp_stream();
|
||||
|
||||
json to_json_anthropic();
|
||||
|
||||
json to_json_anthropic_stream();
|
||||
@@ -397,45 +421,35 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
||||
bool is_updated = false;
|
||||
|
||||
// Streaming state copied from task_result_state for this chunk
|
||||
bool thinking_block_started = false;
|
||||
bool text_block_started = false;
|
||||
|
||||
// for OpenAI Responses API
|
||||
std::string oai_resp_id;
|
||||
std::string oai_resp_reasoning_id;
|
||||
std::string oai_resp_message_id;
|
||||
std::string oai_resp_fc_id;
|
||||
|
||||
// for Anthropic API: track if any reasoning content has been generated
|
||||
bool anthropic_has_reasoning = false;
|
||||
// Streaming state copied from task_result_state for this chunk
|
||||
bool anthropic_thinking_block_started = false;
|
||||
bool anthropic_text_block_started = false;
|
||||
|
||||
virtual bool is_stop() override {
|
||||
return false; // in stream mode, partial responses are not considered stop
|
||||
}
|
||||
|
||||
virtual void update(task_result_state & state) override;
|
||||
|
||||
virtual json to_json() override;
|
||||
|
||||
virtual void update(task_result_state & state) override {
|
||||
is_updated = true;
|
||||
state.update_chat_msg(content, true, oaicompat_msg_diffs);
|
||||
// track if the accumulated message has any reasoning content
|
||||
anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
|
||||
|
||||
// Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
|
||||
anthropic_thinking_block_started = state.anthropic_thinking_block_started;
|
||||
anthropic_text_block_started = state.anthropic_text_block_started;
|
||||
|
||||
// Pre-compute state updates based on diffs (for next chunk)
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
|
||||
state.anthropic_thinking_block_started = true;
|
||||
}
|
||||
if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
|
||||
state.anthropic_text_block_started = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json to_json_non_oaicompat();
|
||||
|
||||
json to_json_oaicompat();
|
||||
|
||||
json to_json_oaicompat_chat();
|
||||
|
||||
json to_json_oaicompat_resp();
|
||||
|
||||
json to_json_anthropic();
|
||||
};
|
||||
|
||||
|
||||
@@ -140,6 +140,7 @@ int main(int argc, char ** argv) {
|
||||
routes.post_completions = models_routes->proxy_post;
|
||||
routes.post_completions_oai = models_routes->proxy_post;
|
||||
routes.post_chat_completions = models_routes->proxy_post;
|
||||
routes.post_responses_oai = models_routes->proxy_post;
|
||||
routes.post_anthropic_messages = models_routes->proxy_post;
|
||||
routes.post_anthropic_count_tokens = models_routes->proxy_post;
|
||||
routes.post_infill = models_routes->proxy_post;
|
||||
@@ -176,6 +177,7 @@ int main(int argc, char ** argv) {
|
||||
ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
|
||||
ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
|
||||
ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
|
||||
ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
|
||||
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
|
||||
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
|
||||
ctx_http.post("/infill", ex_wrapper(routes.post_infill));
|
||||
|
||||
@@ -2,7 +2,7 @@ aiohttp~=3.9.3
|
||||
pytest~=8.3.3
|
||||
huggingface_hub>=0.34.0,<1.0
|
||||
numpy~=1.26.4
|
||||
openai~=1.55.3
|
||||
openai~=2.14.0
|
||||
prometheus-client~=0.20.0
|
||||
requests~=2.32.3
|
||||
wget~=3.2
|
||||
|
||||
73
tools/server/tests/unit/test_compat_oai_responses.py
Normal file
73
tools/server/tests/unit/test_compat_oai_responses.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import pytest
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
def test_responses_with_openai_library():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
|
||||
res = client.responses.create(
|
||||
model="gpt-4.1",
|
||||
input=[
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
],
|
||||
max_output_tokens=8,
|
||||
temperature=0.8,
|
||||
)
|
||||
assert res.id.startswith("resp_")
|
||||
assert res.output[0].id is not None
|
||||
assert res.output[0].id.startswith("msg_")
|
||||
assert match_regex("(Suddenly)+", res.output_text)
|
||||
|
||||
def test_responses_stream_with_openai_library():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
|
||||
stream = client.responses.create(
|
||||
model="gpt-4.1",
|
||||
input=[
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
],
|
||||
max_output_tokens=8,
|
||||
temperature=0.8,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
gathered_text = ''
|
||||
resp_id = ''
|
||||
msg_id = ''
|
||||
for r in stream:
|
||||
if r.type == "response.created":
|
||||
assert r.response.id.startswith("resp_")
|
||||
resp_id = r.response.id
|
||||
if r.type == "response.in_progress":
|
||||
assert r.response.id == resp_id
|
||||
if r.type == "response.output_item.added":
|
||||
assert r.item.id is not None
|
||||
assert r.item.id.startswith("msg_")
|
||||
msg_id = r.item.id
|
||||
if (r.type == "response.content_part.added" or
|
||||
r.type == "response.output_text.delta" or
|
||||
r.type == "response.output_text.done" or
|
||||
r.type == "response.content_part.done"):
|
||||
assert r.item_id == msg_id
|
||||
if r.type == "response.output_item.done":
|
||||
assert r.item.id == msg_id
|
||||
|
||||
if r.type == "response.output_text.delta":
|
||||
gathered_text += r.delta
|
||||
if r.type == "response.completed":
|
||||
assert r.response.id.startswith("resp_")
|
||||
assert r.response.output[0].id is not None
|
||||
assert r.response.output[0].id.startswith("msg_")
|
||||
assert gathered_text == r.response.output_text
|
||||
assert match_regex("(Suddenly)+", r.response.output_text)
|
||||
2
vendor/cpp-httplib/CMakeLists.txt
vendored
2
vendor/cpp-httplib/CMakeLists.txt
vendored
@@ -142,7 +142,7 @@ elseif (LLAMA_OPENSSL)
|
||||
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "OpenSSL not found, SSL support disabled")
|
||||
message(WARNING "OpenSSL not found, HTTPS support disabled")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user