Compare commits

...

3 Commits

Author SHA1 Message Date
Georgi Gerganov
c6a04cb5c3 ggml-metal: fix 2D async copy to use row-by-row transfers
The MTLBlitCommandEncoder 2D copy method is not properly declared in
the headers, causing compiler warnings. Use a loop with the simpler
1D copy method instead, which is already used elsewhere in the codebase.

Assisted-by: llama.cpp:local pi
2026-04-29 14:57:48 +03:00
Georgi Gerganov
f9e19a1f6e pi: add rule to not force push branches unless asked
Assisted-by: llama.cpp:local pi
2026-04-29 14:37:13 +03:00
Georgi Gerganov
c3a54d6253 ggml-metal: implement async 2D tensor copy functions
Add ggml_metal_set_tensor_2d_async and ggml_metal_get_tensor_2d_async
functions to the Metal backend, mirroring the CUDA implementation.

These functions use MTLBlitCommandEncoder's 2D copy API to efficiently
copy tensor data with different strides between host and device memory,
enabling row-strided transfers without requiring contiguous layouts.

Assisted-by: llama.cpp:local pi
2026-04-29 14:32:25 +03:00
4 changed files with 103 additions and 2 deletions
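For context, here is a minimal caller-side sketch of the 2D async copy API described in the commits above. Only the function declarations come from the header diff below; the sizes, strides, and the assumption that ctx and tensor come from the usual Metal backend setup are illustrative.

    // sketch only: assumes ctx is a ggml_metal_t from the usual backend init and
    // tensor lives in a Metal backend buffer (neither setup is shown here)
    const size_t n_copies      = 8;    // number of rows to transfer
    const size_t size          = 512;  // bytes copied per row
    const size_t stride_data   = 512;  // rows are packed back-to-back in the host buffer
    const size_t stride_tensor = 1024; // rows are strided inside the device tensor

    char host_data[8*512];             // data to upload, filled by the caller

    // queues n_copies row-sized copies from host_data into the tensor and returns
    // before the transfer has completed
    ggml_metal_set_tensor_2d_async(ctx, tensor, host_data, 0 /* offset */, size, n_copies, stride_tensor, stride_data);

    // ggml_metal_synchronize is declared in the same header and is presumably how the
    // caller waits for the queued command buffers to finish
    ggml_metal_synchronize(ctx);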

View File

@@ -17,6 +17,7 @@ Pull requests (PRs):
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
- Always create the pull requests in draft mode
- Do NOT force push branches unless explicitly asked to do so
Commits:
- On every commit that you make, include an "Assisted-by: llama.cpp:local pi" tag

View File

@@ -21,6 +21,8 @@ void ggml_metal_synchronize(ggml_metal_t ctx);
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void ggml_metal_set_tensor_2d_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void ggml_metal_get_tensor_2d_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);

View File

@@ -392,6 +392,90 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
    }
}

void ggml_metal_set_tensor_2d_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data,
        size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
    @autoreleasepool {
        // wrap the source data into a Metal buffer
        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);

        id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                                    length:stride_data * n_copies
                                                   options:MTLResourceStorageModeShared];

        GGML_ASSERT(buf_src);

        struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
        if (bid_dst.metal == nil) {
            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
        }

        bid_dst.offs += offset;

        // queue the copy operation into the queue of the Metal context
        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

        // the MTLBlitCommandEncoder 2D copy method is not properly declared in the headers,
        // so issue one 1D copy per row instead
        for (size_t i = 0; i < n_copies; i++) {
            [encoder copyFromBuffer:buf_src
                       sourceOffset:i * stride_data
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs + i * stride_tensor
                               size:size];
        }

        [encoder endEncoding];
        [cmd_buf commit];

        [buf_src release];

        [ctx->cmd_bufs_ext addObject:cmd_buf];
        ctx->cmd_buf_last = cmd_buf;

        [cmd_buf retain];
    }
}

void ggml_metal_get_tensor_2d_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data,
        size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
    @autoreleasepool {
        // wrap the destination host memory into a Metal buffer, without copying it
        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);

        id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                                          length:stride_data * n_copies
                                                         options:MTLResourceStorageModeShared
                                                     deallocator:nil];

        GGML_ASSERT(buf_dst);

        struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
        if (bid_src.metal == nil) {
            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
        }

        bid_src.offs += offset;

        // queue the copy operation into the queue of the Metal context
        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

        // copy each row with a separate 1D blit (see the matching set path above)
        for (size_t i = 0; i < n_copies; i++) {
            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs + i * stride_tensor
                           toBuffer:buf_dst
                  destinationOffset:i * stride_data
                               size:size];
        }

        [encoder endEncoding];
        [cmd_buf commit];

        [buf_dst release];

        [ctx->cmd_bufs_ext addObject:cmd_buf];
        ctx->cmd_buf_last = cmd_buf;

        [cmd_buf retain];
    }
}

bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    @autoreleasepool {
        struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src);

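A note on the two functions above: per commit c6a04cb5c3, each row is copied with its own 1D blit because the MTLBlitCommandEncoder 2D copy method is not properly declared in the headers. The upload path copies the host data into a fresh shared buffer via newBufferWithBytes, while the download path wraps the caller's memory with newBufferWithBytesNoCopy so the blit writes directly into it. In both cases the command buffer is only committed, not waited on; it is appended to ctx->cmd_bufs_ext and recorded as ctx->cmd_buf_last, presumably so that a later ggml_metal_synchronize call can wait for the outstanding copies.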
View File

@@ -507,6 +507,20 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg
    ggml_metal_get_tensor_async(ctx, tensor, data, offset, size);
}

static void ggml_backend_metal_set_tensor_2d_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data,
        size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
    ggml_metal_t ctx = (ggml_metal_t)backend->context;

    ggml_metal_set_tensor_2d_async(ctx, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}

static void ggml_backend_metal_get_tensor_2d_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data,
        size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
    ggml_metal_t ctx = (ggml_metal_t)backend->context;

    ggml_metal_get_tensor_2d_async(ctx, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}

static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
    if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) {
        return false;
@@ -567,8 +581,8 @@ static ggml_backend_i ggml_backend_metal_i = {
    /* .free = */ ggml_backend_metal_free,
    /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
    /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
    /* .get_tensor_2d_async = */ NULL,
    /* .set_tensor_2d_async = */ NULL,
    /* .get_tensor_2d_async = */ ggml_backend_metal_get_tensor_2d_async,
    /* .set_tensor_2d_async = */ ggml_backend_metal_set_tensor_2d_async,
    /* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
    /* .synchronize = */ ggml_backend_metal_synchronize,
    /* .graph_plan_create = */ NULL,