Compare commits

...

18 Commits

Author SHA1 Message Date
Ruben Ortlam
4cabbe36e0 state 2026-04-09 13:00:31 +02:00
Ruben Ortlam
9f001cae27 state 2026-04-09 12:51:43 +02:00
Ruben Ortlam
88335c0490 state 2026-04-09 12:39:51 +02:00
Ruben Ortlam
204023c897 state 2026-04-09 12:36:15 +02:00
Ruben Ortlam
d88d722fc1 state 2026-04-09 12:32:08 +02:00
Ruben Ortlam
96d9516329 state 2026-04-09 12:25:27 +02:00
Ruben Ortlam
8a108eddb4 state 2026-04-09 12:05:15 +02:00
Ruben Ortlam
47dde34e00 state 2026-04-09 11:58:46 +02:00
Ruben Ortlam
8d0e158076 state 2026-04-09 11:51:39 +02:00
Ruben Ortlam
aade0f81dd state 2026-04-09 11:42:50 +02:00
Ruben Ortlam
700270239d state 2026-04-09 11:24:21 +02:00
Ruben Ortlam
ddaafa3dc1 state 2026-04-09 11:11:17 +02:00
Ruben Ortlam
e5e0be0add state 2026-04-09 11:00:36 +02:00
Ruben Ortlam
3c4eae7dc9 state 2026-04-09 07:50:05 +02:00
Ruben Ortlam
7e2799c8c9 state 2026-04-09 07:40:02 +02:00
Ruben Ortlam
cd0722594a state 2026-04-09 07:25:33 +02:00
Martin Klacer
5c4aae66e1 devops: kleidiai: provide KleidiAI-Enabled ARM Release Artifact (#21259)
* Unified macOS release setup with strategy-matrix block
 * Added KleidiAI arm64 macOS release definition


Change-Id: I05520889ffc646488a178d06817a17f29274465a

Signed-off-by: Martin Klacer <martin.klacer@arm.com>
2026-04-08 13:06:12 +08:00
Aman Gupta
c5ce4bc227 CUDA: make cuda graphs props check faster (#21472)
* CUDA: compute fast hash instead of expensive props check

* use seen node

* use memcp
2026-04-08 09:05:51 +08:00
4 changed files with 242 additions and 203 deletions

View File

@@ -36,8 +36,26 @@ env:
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
jobs:
macOS-arm64:
runs-on: macos-14
macOS-cpu:
strategy:
matrix:
include:
- build: 'arm64'
arch: 'arm64'
os: macos-14
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
- build: 'arm64-kleidiai'
arch: 'arm64'
os: macos-14
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
- build: 'x64'
arch: 'x64'
os: macos-15-intel
# Metal is disabled on x64 due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
runs-on: ${{ matrix.os }}
steps:
- name: Clone
@@ -49,7 +67,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: macOS-latest-arm64
key: macOS-latest-${{ matrix.arch }}
evict-old-files: 1d
- name: Build
@@ -57,13 +75,11 @@ jobs:
run: |
sysctl -a
cmake -B build \
${{ matrix.defines }} \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -75,61 +91,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
name: llama-bin-macos-arm64.tar.gz
macOS-x64:
runs-on: macos-15-intel
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: macOS-latest-x64
evict-old-files: 1d
- name: Build
id: cmake_build
run: |
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz
path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
name: llama-bin-macos-${{ matrix.build }}.tar.gz
ubuntu-cpu:
strategy:
@@ -1003,8 +971,7 @@ jobs:
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
- macOS-arm64
- macOS-x64
- macOS-cpu
- ios-xcode-build
- openEuler-cann
@@ -1079,6 +1046,7 @@ jobs:
**macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)

View File

@@ -1157,19 +1157,6 @@ struct ggml_tensor_extra_gpu {
#define USE_CUDA_GRAPH
#endif
// Snapshot of the tensor fields that are compared from one graph evaluation to the next
// in order to decide whether a previously captured CUDA graph can be reused.
struct ggml_cuda_graph_node_properties {
// data pointer of the tensor (not compared for GGML_OP_VIEW nodes)
void * node_data;
// operation of the tensor
ggml_op node_op;
// element type of the tensor
enum ggml_type node_type;
// tensor flags; only the GGML_TENSOR_FLAG_COMPUTE bit takes part in the comparison
int32_t flags;
// number of elements per dimension
int64_t ne[GGML_MAX_DIMS];
// stride in bytes per dimension
size_t nb[GGML_MAX_DIMS];
// data pointer of every source tensor; nullptr where the source slot is empty
void * src_data[GGML_MAX_SRC];
// byte-wise copy of the tensor's op_params
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};
// the snapshot is cleared with memset before it is filled, so the type has to stay trivial
static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
~ggml_cuda_graph() {
@@ -1186,13 +1173,7 @@ struct ggml_cuda_graph {
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool warmup_complete = false;
std::vector<ggml_cuda_graph_node_properties> props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
// their properties also have to match in order to be able to safely reuse a CUDA graph
// ref: https://github.com/ggml-org/llama.cpp/pull/18583
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector<ggml_cuda_graph_node_properties> extra;
std::vector<ggml_tensor> nodes_copy;
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

View File

@@ -82,7 +82,6 @@
#include <cstdlib>
#include <string>
#include <vector>
#include <unordered_set>
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
@@ -2969,74 +2968,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
return use_cuda_graph;
}
// Stores a snapshot of `node` in `props`: data pointer, op, type, flags, shape, strides,
// the data pointer of every present source and the raw op_params bytes.
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
    // start from an all-zero snapshot so that empty source slots end up as nullptr
    memset(props, 0, sizeof(*props));

    props->node_data = node->data;
    props->node_op   = node->op;
    props->node_type = node->type;
    props->flags     = node->flags;

    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        props->ne[dim] = node->ne[dim];
        props->nb[dim] = node->nb[dim];
    }

    for (int s = 0; s < GGML_MAX_SRC; ++s) {
        if (const ggml_tensor * src = node->src[s]) {
            props->src_data[s] = src->data;
        }
    }

    memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
// Returns true when `node` still matches the snapshot stored in `props`.
// For GGML_OP_VIEW nodes the data pointers (the node's own and those of its sources)
// are not compared; op, type, shape, strides, op_params and the COMPUTE flag always are.
static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
    const bool is_view = node->op == GGML_OP_VIEW;

    if (!is_view && node->data != props->node_data) {
        return false;
    }

    if (node->op != props->node_op || node->type != props->node_type) {
        return false;
    }

    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        if (node->ne[dim] != props->ne[dim] || node->nb[dim] != props->nb[dim]) {
            return false;
        }
    }

    if (!is_view) {
        for (int s = 0; s < GGML_MAX_SRC; ++s) {
            // an empty source slot has to be recorded as nullptr in the snapshot
            const void * current = node->src[s] ? node->src[s]->data : nullptr;
            if (current != props->src_data[s]) {
                return false;
            }
        }
    }

    if (memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
        return false;
    }

    // only the COMPUTE bit of the flags is relevant for the comparison
    const auto compute_now    = node->flags  & GGML_TENSOR_FLAG_COMPUTE;
    const auto compute_stored = props->flags & GGML_TENSOR_FLAG_COMPUTE;
    return compute_now == compute_stored;
}
// Returns the address of the first node of `cgraph` as an opaque key.
// NOTE(review): reads nodes[0] unconditionally - callers presumably guarantee a non-empty graph.
static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
    const ggml_tensor * first_node = cgraph->nodes[0];
    return first_node;
}
@@ -3048,52 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
// Check if the graph size has changed
if (graph->props.size() != (size_t)cgraph->n_nodes) {
if ((int)graph->nodes_copy.size() != cgraph->n_nodes) {
res = true;
graph->props.resize(cgraph->n_nodes);
graph->nodes_copy.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
std::unordered_set<ggml_tensor *> seen_node;
std::vector<ggml_tensor *> srcs_extra;
for (int i = 0; i < cgraph->n_nodes; i++) {
bool props_match = true;
seen_node.insert(cgraph->nodes[i]);
if (!res) {
props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]);
}
if (!props_match) {
res = true;
}
ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]);
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
ggml_tensor * src = cgraph->nodes[i]->src[src_idx];
if (src && seen_node.find(src) == seen_node.end()) {
srcs_extra.push_back(src);
if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
res = true;
}
}
}
if (graph->extra.size() != (size_t) srcs_extra.size()) {
res = true;
graph->extra.resize(srcs_extra.size());
}
for (size_t i = 0; i < srcs_extra.size(); ++i) {
bool props_match = true;
if (!res) {
props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]);
}
if (!props_match) {
res = true;
}
ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]);
memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor));
}
return res;

View File

@@ -40,6 +40,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
#include <future>
#include <thread>
#if defined(_MSC_VER)
# define NOMINMAX 1
# include <windows.h>
@@ -580,6 +581,22 @@ static constexpr std::initializer_list<std::array<int, 3>> rms_norm_mul_rope_vie
};
// One host-memory staging buffer for a cross-device copy. The same host allocation is
// imported as a buffer on both the source and the destination device.
// Members are filled positionally by aggregate initialization, so their order matters.
struct vk_peer_copy_buf {
// aligned host allocation backing both buffers; nullptr while nothing is allocated
void * host_ptr = nullptr;
// size of the host allocation in bytes (the requested size rounded up to the import alignment)
size_t host_size = 0;
vk_buffer src_buf; // host_ptr imported into source device
vk_buffer dst_buf; // host_ptr imported into dest device
};
// Staging state a source device keeps for one peer (destination) device.
struct vk_peer_staging {
std::vector<vk_peer_copy_buf> bufs; // per-copy buffer pool
// index of the next pool entry to hand out
size_t buf_idx = 0; // reset between iterations
// timeline semaphore on source device for hop1 synchronization
vk::Semaphore tl_sem;
// most recent value tl_sem was asked to signal; incremented for every hop1 submission
uint64_t tl_sem_value = 0;
};
struct vk_device_struct {
std::recursive_mutex mutex;
@@ -857,6 +874,8 @@ struct vk_device_struct {
vk::Fence fence;
vk_buffer sync_staging;
std::map<vk_device_struct *, vk_peer_staging> peer_staging;
ggml_backend_buffer_type buffer_type;
bool disable_fusion;
@@ -871,6 +890,24 @@ struct vk_device_struct {
device.destroyFence(fence);
for (auto& [peer, staging] : peer_staging) {
if (staging.tl_sem) {
device.destroySemaphore(staging.tl_sem);
}
for (auto& buf : staging.bufs) {
buf.src_buf.reset();
buf.dst_buf.reset();
if (buf.host_ptr) {
#if defined(_MSC_VER) || defined(__MINGW32__)
_aligned_free(buf.host_ptr);
#else
free(buf.host_ptr);
#endif
}
}
}
peer_staging.clear();
ggml_vk_destroy_buffer(sync_staging);
compute_queue.cmd_pool.destroy(device);
@@ -1651,7 +1688,6 @@ typedef std::weak_ptr<vk_context_struct> vk_context_ref;
// Lists of Vulkan objects owned by a backend context whose release is deferred.
struct ggml_vk_garbage_collector {
// timeline semaphores; destroyed during graph cleanup
std::vector<vk_semaphore> tl_semaphores;
// binary semaphores; destroyed during graph cleanup
std::vector<vk_semaphore> semaphores;
// NOTE(review): presumably events pending reset/destruction - confirm against the cleanup code
std::vector<vk::Event> events;
// NOTE(review): presumably contexts kept alive until cleanup - confirm against the cleanup code
std::vector<vk_context> contexts;
};
@@ -2493,15 +2529,6 @@ static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
return result;
}
// Creates a binary semaphore on the context's device and registers it in ctx->gc.semaphores.
// Returns a pointer to the newly registered entry. The pointer refers to an element of
// ctx->gc.semaphores and is therefore invalidated when that vector reallocates or is cleared.
static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
    // log the name of this function (the message used to name the timeline variant)
    VK_LOG_DEBUG("ggml_vk_create_binary_semaphore()");
    vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
    vk::SemaphoreCreateInfo ci{};
    ci.setPNext(&tci);
    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
    ctx->gc.semaphores.push_back({ semaphore, 0 });
    return &ctx->gc.semaphores.back();
}
static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
@@ -13331,10 +13358,14 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
}
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
// Reset device-level command pools used by cross-device hop1 temporary contexts
if (!ctx->device->peer_staging.empty()) {
ggml_vk_queue_command_pools_cleanup(ctx->device);
}
for (auto& [peer, staging] : ctx->device->peer_staging) {
staging.buf_idx = 0;
}
ctx->gc.semaphores.clear();
for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
@@ -13757,6 +13788,112 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
}
}
static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
// Releases everything held by `buf`: first both imported buffer handles, then the host
// allocation they were created from. Leaves host_ptr/host_size cleared when memory was freed.
static void ggml_vk_free_peer_copy_buf(vk_peer_copy_buf& buf) {
    buf.src_buf.reset();
    buf.dst_buf.reset();

    void * const mem = buf.host_ptr;
    if (mem == nullptr) {
        return;
    }

    // the allocation came from the platform's aligned allocator, so free it with the matching call
#if defined(_MSC_VER) || defined(__MINGW32__)
    _aligned_free(mem);
#else
    free(mem);
#endif

    buf.host_ptr  = nullptr;
    buf.host_size = 0;
}
// Allocates an aligned host buffer of at least `required_size` bytes and imports it as a
// buffer on both `src_dev` and `dst_dev`. On success `buf` is overwritten with the new
// allocation and true is returned; on any failure everything allocated here is released,
// `buf` is left untouched and false is returned.
static bool ggml_vk_alloc_peer_copy_buf(vk_device& src_dev, vk_device& dst_dev,
                                        vk_peer_copy_buf& buf, size_t required_size) {
    // counterpart of the platform-specific aligned allocation below
    const auto release_host = [](void * mem) {
#if defined(_MSC_VER) || defined(__MINGW32__)
        _aligned_free(mem);
#else
        free(mem);
#endif
    };

    // the allocation has to satisfy the stricter of the two devices' import alignments
    uint64_t alignment = std::max(src_dev->min_imported_host_pointer_alignment,
                                  dst_dev->min_imported_host_pointer_alignment);
    if (alignment == 0) {
        alignment = 4096;
    }

    // round the size up to a whole number of alignment units
    const size_t alloc_size = CEIL_DIV(required_size, alignment) * alignment;

    void * host_ptr = nullptr;
#if defined(_MSC_VER) || defined(__MINGW32__)
    host_ptr = _aligned_malloc(alloc_size, (size_t)alignment);
#else
    if (posix_memalign(&host_ptr, (size_t)alignment, alloc_size) != 0) {
        host_ptr = nullptr;
    }
#endif
    if (host_ptr == nullptr) {
        return false;
    }

    vk_buffer src_buf = ggml_vk_buffer_from_host_ptr(src_dev, host_ptr, alloc_size);
    if (!src_buf) {
        release_host(host_ptr);
        return false;
    }

    vk_buffer dst_buf = ggml_vk_buffer_from_host_ptr(dst_dev, host_ptr, alloc_size);
    if (!dst_buf) {
        // drop the source-side import before giving the memory back
        src_buf.reset();
        release_host(host_ptr);
        return false;
    }

    buf.host_ptr  = host_ptr;
    buf.host_size = alloc_size;
    buf.src_buf   = std::move(src_buf);
    buf.dst_buf   = std::move(dst_buf);
    return true;
}
// Hands out the next staging buffer of the (src_dev -> dst_dev) pool, sized for at least
// `required_size` bytes. Creates the peer_staging entry and its timeline semaphore on
// first use. Returns nullptr when either device lacks external_memory_host or when a
// buffer could not be allocated. The returned pointer refers to an element of the pool vector.
static vk_peer_copy_buf * ggml_vk_get_peer_copy_buf(vk_device& src_dev, vk_device& dst_dev,
                                                    size_t required_size) {
    const bool both_can_import = src_dev->external_memory_host && dst_dev->external_memory_host;
    if (!both_can_import) {
        return nullptr;
    }

    vk_peer_staging & staging = src_dev->peer_staging[dst_dev.get()];

    // the timeline semaphore is created lazily, the first time this peer pair is used
    if (!staging.tl_sem) {
        vk::SemaphoreTypeCreateInfo type_info{ vk::SemaphoreType::eTimeline, 0 };
        vk::SemaphoreCreateInfo create_info{};
        create_info.setPNext(&type_info);
        staging.tl_sem = src_dev->device.createSemaphore(create_info);
    }

    const bool pool_exhausted = staging.buf_idx >= staging.bufs.size();
    if (pool_exhausted) {
        // grow the pool by one entry; remove it again if it cannot be backed by memory
        staging.bufs.emplace_back();
        vk_peer_copy_buf & fresh = staging.bufs.back();
        if (!ggml_vk_alloc_peer_copy_buf(src_dev, dst_dev, fresh, required_size)) {
            staging.bufs.pop_back();
            return nullptr;
        }
        staging.buf_idx++;
        return &fresh;
    }

    // reuse an existing entry, replacing its allocation when it is too small
    vk_peer_copy_buf & slot = staging.bufs[staging.buf_idx];
    if (slot.host_size < required_size) {
        ggml_vk_free_peer_copy_buf(slot);
        if (!ggml_vk_alloc_peer_copy_buf(src_dev, dst_dev, slot, required_size)) {
            return nullptr;
        }
    }
    staging.buf_idx++;
    return &slot;
}
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
@@ -13776,9 +13913,66 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
if (ggml_backend_buffer_is_vk(src->buffer)) {
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
// Async copy only works within the same device
if (src_buf_ctx->dev_buffer->device != dst_buf->device) {
return false;
// Cross-device copy via per-copy staging buffer
ggml_backend_vk_context * src_ctx = (ggml_backend_vk_context *)backend_src->context;
vk_device src_dev = src_ctx->device;
vk_device dst_dev = ctx->device;
size_t nbytes = ggml_nbytes(src);
vk_buffer src_vk_buf = src_buf_ctx->dev_buffer;
size_t src_offset = vk_tensor_offset(src) + src->view_offs;
size_t dst_offset = vk_tensor_offset(dst) + dst->view_offs;
vk_peer_copy_buf * copy_buf = ggml_vk_get_peer_copy_buf(src_dev, dst_dev, nbytes);
if (!copy_buf) {
return false;
}
auto& staging = src_dev->peer_staging[dst_dev.get()];
// HOP 1: src VRAM → staging (on source compute queue)
// Implicit queue submission ordering guarantees this
// executes after all prior compute work.
vk_context hop1_ctx;
{
std::lock_guard<std::recursive_mutex> guard(src_dev->mutex);
hop1_ctx = ggml_vk_create_temporary_context(src_dev->compute_queue.cmd_pool);
ggml_vk_ctx_begin(src_dev, hop1_ctx);
VkBufferCopy bc{ src_offset, 0, nbytes };
vkCmdCopyBuffer(hop1_ctx->s->buffer->buf,
(VkBuffer)src_vk_buf->buffer,
(VkBuffer)copy_buf->src_buf->buffer,
1, &bc);
ggml_vk_ctx_end(hop1_ctx);
}
// Submit hop1, signal timeline semaphore, CPU wait
staging.tl_sem_value++;
hop1_ctx->seqs.back().back().signal_semaphores.push_back(
{ staging.tl_sem, staging.tl_sem_value });
ggml_vk_submit(hop1_ctx, {});
src_ctx->submit_pending = true;
vk::SemaphoreWaitInfo swi{
vk::SemaphoreWaitFlags{},
1, &staging.tl_sem, &staging.tl_sem_value
};
VK_CHECK(src_dev->device.waitSemaphores(swi, UINT64_MAX),
"cross_device_hop1 waitSemaphores");
// HOP 2: staging → dst VRAM (on dest device)
vk_context dst_compute_ctx = ggml_vk_get_compute_ctx(ctx);
VkBufferCopy bc2{ 0, dst_offset, nbytes };
vkCmdCopyBuffer(dst_compute_ctx->s->buffer->buf,
(VkBuffer)copy_buf->dst_buf->buffer,
(VkBuffer)dst_buf->buffer,
1, &bc2);
return true;
}
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
@@ -13815,7 +14009,6 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
src->data, ggml_nbytes(src));
}
GGML_UNUSED(backend_src);
return false;
}