state

2026-05-05 08:34:21 +00:00 · 2026-04-09 13:00:31 +02:00 · 2026-04-09 12:51:43 +02:00 · 2026-04-09 12:39:51 +02:00 · 2026-04-09 12:36:15 +02:00 · 2026-04-09 12:32:08 +02:00
4 changed files with 242 additions and 203 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,8 +36,26 @@ env:
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

 jobs:
-  macOS-arm64:
-    runs-on: macos-14
+  macOS-cpu:
+    strategy:
+      matrix:
+        include:
+          - build: 'arm64'
+            arch: 'arm64'
+            os: macos-14
+            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
+          - build: 'arm64-kleidiai'
+            arch: 'arm64'
+            os: macos-14
+            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
+          - build: 'x64'
+            arch: 'x64'
+            os: macos-15-intel
+            # Metal is disabled on x64 due to intermittent failures with GitHub runners not having a GPU:
+            # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+            defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
+
+    runs-on: ${{ matrix.os }}

    steps:
      - name: Clone
@@ -49,7 +67,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-arm64
+          key: macOS-latest-${{ matrix.arch }}
          evict-old-files: 1d

      - name: Build
@@ -57,13 +75,11 @@ jobs:
        run: |
          sysctl -a
          cmake -B build \
+            ${{ matrix.defines }} \
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -75,61 +91,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
-          name: llama-bin-macos-arm64.tar.gz
-
-  macOS-x64:
-    runs-on: macos-15-intel
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-x64
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_INSTALL_RPATH='@loader_path' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
-          name: llama-bin-macos-x64.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
+          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
    strategy:
@@ -1003,8 +971,7 @@ jobs:
      - ubuntu-cpu
      - ubuntu-vulkan
      - ubuntu-24-openvino
-      - macOS-arm64
-      - macOS-x64
+      - macOS-cpu
      - ios-xcode-build
      - openEuler-cann

@@ -1079,6 +1046,7 @@ jobs:

            **macOS/iOS:**
            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
+            - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)

--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1157,19 +1157,6 @@ struct ggml_tensor_extra_gpu {
 #define USE_CUDA_GRAPH
 #endif

-struct ggml_cuda_graph_node_properties {
-    void * node_data;
-    ggml_op node_op;
-    enum ggml_type node_type;
-    int32_t flags;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
-    void * src_data[GGML_MAX_SRC];
-    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-};
-
-static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
-
 struct ggml_cuda_graph {
 #ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
@@ -1186,13 +1173,7 @@ struct ggml_cuda_graph {
    std::vector<cudaGraphNode_t> nodes;
    bool disable_due_to_gpu_arch = false;
    bool warmup_complete = false;
-    std::vector<ggml_cuda_graph_node_properties> props;
-
-    // these are extra tensors (inputs) that participate in the ggml graph but are not nodes
-    // they properties also have to match in order to be able to safely reuse a CUDA graph
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18583
-    // ref: https://github.com/ggml-org/llama.cpp/pull/19165
-    std::vector<ggml_cuda_graph_node_properties> extra;
+    std::vector<ggml_tensor> nodes_copy;

    bool is_enabled() const {
        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -82,7 +82,6 @@
 #include <cstdlib>
 #include <string>
 #include <vector>
-#include <unordered_set>

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

@@ -2969,74 +2968,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
    return use_cuda_graph;
 }

-static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
-    memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
-    props->node_data = node->data;
-    props->node_op = node->op;
-    props->node_type = node->type;
-    props->flags = node->flags;
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        props->ne[i] = node->ne[i];
-        props->nb[i] = node->nb[i];
-    }
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (!node->src[i]) {
-            continue;
-        }
-
-        props->src_data[i] = node->src[i]->data;
-    }
-    memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
-}
-
-static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
-    if (node->data != props->node_data && node->op != GGML_OP_VIEW) {
-        return false;
-    }
-
-    if (node->op != props->node_op) {
-        return false;
-    }
-
-    if (node->type != props->node_type) {
-        return false;
-    }
-
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (node->ne[i] != props->ne[i]) {
-            return false;
-        }
-        if (node->nb[i] != props->nb[i]) {
-            return false;
-        }
-    }
-
-    if (node->op != GGML_OP_VIEW) {
-        for (int i = 0; i < GGML_MAX_SRC; i++) {
-            if (!node->src[i]) {
-                if (props->src_data[i] != nullptr) {
-                    return false;
-                }
-                continue;
-            }
-
-            if (node->src[i]->data != props->src_data[i]) {
-                return false;
-            }
-        }
-    }
-
-    if (memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
-        return false;
-    }
-
-    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
-        return false;
-    }
-
-    return true;
-}
-
 static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
    return cgraph->nodes[0];
 }
@@ -3048,52 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
    ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);

    // Check if the graph size has changed
-    if (graph->props.size() != (size_t)cgraph->n_nodes) {
+    if ((int)graph->nodes_copy.size() != cgraph->n_nodes) {
        res = true;
-        graph->props.resize(cgraph->n_nodes);
+        graph->nodes_copy.resize(cgraph->n_nodes);
    }

-    // Loop over nodes in GGML graph to determine if CUDA graph update is required
-    // and store properties to allow this comparison for the next token
-    std::unordered_set<ggml_tensor *> seen_node;
-    std::vector<ggml_tensor *> srcs_extra;
    for (int i = 0; i < cgraph->n_nodes; i++) {
-        bool props_match = true;
-
-        seen_node.insert(cgraph->nodes[i]);
-
        if (!res) {
-            props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]);
-        }
-        if (!props_match) {
-            res = true;
-        }
-        ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]);
-
-        for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
-            ggml_tensor * src = cgraph->nodes[i]->src[src_idx];
-            if (src && seen_node.find(src) == seen_node.end()) {
-                srcs_extra.push_back(src);
+            if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
+                res = true;
            }
        }
-    }
-
-    if (graph->extra.size() != (size_t) srcs_extra.size()) {
-        res = true;
-        graph->extra.resize(srcs_extra.size());
-    }
-
-    for (size_t i = 0; i < srcs_extra.size(); ++i) {
-        bool props_match = true;
-
-        if (!res) {
-            props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]);
-        }
-
-        if (!props_match) {
-            res = true;
-        }
-        ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]);
+        memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor));
    }

    return res;
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -40,6 +40,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #include <future>
 #include <thread>

+
 #if defined(_MSC_VER)
 # define NOMINMAX 1
 # include <windows.h>
@@ -580,6 +581,22 @@ static constexpr std::initializer_list<std::array<int, 3>> rms_norm_mul_rope_vie
 };


+struct vk_peer_copy_buf {            // one host staging allocation imported into both devices of a cross-device copy
+    void *     host_ptr  = nullptr;  // aligned host allocation backing both imports; nullptr while unallocated
+    size_t     host_size = 0;        // allocated size in bytes (requested size rounded up to the import alignment)
+    vk_buffer  src_buf;              // host_ptr imported into source device
+    vk_buffer  dst_buf;              // host_ptr imported into dest device
+};
+
+struct vk_peer_staging {                 // staging state the source device keeps for one peer (destination) device
+    std::vector<vk_peer_copy_buf> bufs;  // per-copy buffer pool
+    size_t buf_idx = 0;                  // next pool slot to hand out; reset to 0 in ggml_vk_graph_cleanup
+
+    // timeline semaphore on source device for hop1 synchronization
+    vk::Semaphore tl_sem;                // created lazily by ggml_vk_get_peer_copy_buf
+    uint64_t tl_sem_value = 0;           // incremented before each hop1 submit, then used as the signal/wait value
+};
+
 struct vk_device_struct {
    std::recursive_mutex mutex;

@@ -857,6 +874,8 @@ struct vk_device_struct {
    vk::Fence fence;
    vk_buffer sync_staging;

+    std::map<vk_device_struct *, vk_peer_staging> peer_staging;
+
    ggml_backend_buffer_type buffer_type;

    bool disable_fusion;
@@ -871,6 +890,24 @@ struct vk_device_struct {

        device.destroyFence(fence);

+        for (auto& [peer, staging] : peer_staging) {
+            if (staging.tl_sem) {
+                device.destroySemaphore(staging.tl_sem);
+            }
+            for (auto& buf : staging.bufs) {
+                buf.src_buf.reset();
+                buf.dst_buf.reset();
+                if (buf.host_ptr) {
+#if defined(_MSC_VER) || defined(__MINGW32__)
+                    _aligned_free(buf.host_ptr);
+#else
+                    free(buf.host_ptr);
+#endif
+                }
+            }
+        }
+        peer_staging.clear();
+
        ggml_vk_destroy_buffer(sync_staging);

        compute_queue.cmd_pool.destroy(device);
@@ -1651,7 +1688,6 @@ typedef std::weak_ptr<vk_context_struct> vk_context_ref;

 struct ggml_vk_garbage_collector {
    std::vector<vk_semaphore> tl_semaphores;
-    std::vector<vk_semaphore> semaphores;
    std::vector<vk::Event> events;
    std::vector<vk_context> contexts;
 };
@@ -2493,15 +2529,6 @@ static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
    return result;
 }

-static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
-    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
-    vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
-    vk::SemaphoreCreateInfo ci{};
-    ci.setPNext(&tci);
-    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
-    ctx->gc.semaphores.push_back({ semaphore, 0 });
-    return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
-}

 static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
@@ -13331,10 +13358,14 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
        ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
    }

-    for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
-        ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
+    // Reset device-level command pools used by cross-device hop1 temporary contexts
+    if (!ctx->device->peer_staging.empty()) {
+        ggml_vk_queue_command_pools_cleanup(ctx->device);
+    }
+
+    for (auto& [peer, staging] : ctx->device->peer_staging) {
+        staging.buf_idx = 0;
    }
-    ctx->gc.semaphores.clear();

    for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
        ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
@@ -13757,6 +13788,112 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
    }
 }

+static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
+
+// Releases both device imports, then frees the host allocation if there is one.
+static void ggml_vk_free_peer_copy_buf(vk_peer_copy_buf& buf) {
+    buf.src_buf.reset();
+    buf.dst_buf.reset();
+    if (buf.host_ptr == nullptr) { return; }
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(buf.host_ptr);
+#else
+    free(buf.host_ptr);
+#endif
+    buf.host_ptr  = nullptr;
+    buf.host_size = 0;
+}
+
+// Allocates host memory usable by both devices and imports it into each of them.
+//   src_dev / dst_dev: devices that must both be able to import the allocation
+//   buf:               overwritten on success, left untouched on failure
+//   required_size:     minimum size in bytes; rounded up to a multiple of the alignment
+// Returns false if the host allocation or either import fails; anything acquired
+// up to that point is released again before returning.
+static bool ggml_vk_alloc_peer_copy_buf(vk_device& src_dev, vk_device& dst_dev,
+                                         vk_peer_copy_buf& buf, size_t required_size) {
+    // the stricter (larger) of the two devices' import alignments applies
+    uint64_t alignment = std::max(src_dev->min_imported_host_pointer_alignment,
+                                  dst_dev->min_imported_host_pointer_alignment);
+    if (alignment == 0) {
+        alignment = 4096; // both devices have 0 here
+    }
+
+    // staged in a local so that all failure paths share one cleanup call
+    vk_peer_copy_buf fresh;
+    fresh.host_size = CEIL_DIV(required_size, alignment) * alignment;
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    fresh.host_ptr = _aligned_malloc(fresh.host_size, (size_t)alignment);
+#else
+    if (posix_memalign(&fresh.host_ptr, (size_t)alignment, fresh.host_size) != 0) {
+        fresh.host_ptr = nullptr;
+    }
+#endif
+    if (!fresh.host_ptr) {
+        return false;
+    }
+
+    // import into the source device first; the destination is only tried if that worked
+    fresh.src_buf = ggml_vk_buffer_from_host_ptr(src_dev, fresh.host_ptr, fresh.host_size);
+    if (fresh.src_buf) {
+        fresh.dst_buf = ggml_vk_buffer_from_host_ptr(dst_dev, fresh.host_ptr, fresh.host_size);
+    }
+
+    if (!fresh.src_buf || !fresh.dst_buf) {
+        // drops whichever import succeeded and frees the host allocation
+        ggml_vk_free_peer_copy_buf(fresh);
+        return false;
+    }
+
+    // hand the finished allocation to the caller
+    buf = std::move(fresh);
+    return true;
+}
+
+// Hands out the next staging buffer of the src_dev -> dst_dev pool, growing the pool or
+// the reused buffer when needed. Creates the pool entry and its timeline semaphore on
+// first use. Returns nullptr if a device lacks external_memory_host or allocation fails.
+static vk_peer_copy_buf * ggml_vk_get_peer_copy_buf(vk_device& src_dev, vk_device& dst_dev,
+                                                      size_t required_size) {
+    const bool can_import = src_dev->external_memory_host && dst_dev->external_memory_host;
+    if (!can_import) {
+        return nullptr;
+    }
+
+    // operator[] default-constructs the entry the first time this pair is seen
+    vk_peer_staging & pool = src_dev->peer_staging[dst_dev.get()];
+
+    if (!pool.tl_sem) {
+        vk::SemaphoreTypeCreateInfo type_info{ vk::SemaphoreType::eTimeline, 0 };
+        vk::SemaphoreCreateInfo sem_info{};
+        sem_info.setPNext(&type_info);
+        pool.tl_sem = src_dev->device.createSemaphore(sem_info);
+    }
+
+    vk_peer_copy_buf * picked = nullptr;
+    if (pool.buf_idx >= pool.bufs.size()) {
+        // all pooled buffers are handed out already: append a fresh one
+        pool.bufs.emplace_back();
+        picked = &pool.bufs.back();
+        if (!ggml_vk_alloc_peer_copy_buf(src_dev, dst_dev, *picked, required_size)) {
+            pool.bufs.pop_back();
+            return nullptr;
+        }
+    } else {
+        picked = &pool.bufs[pool.buf_idx];
+        // an existing buffer is replaced when it cannot hold this copy
+        if (picked->host_size < required_size) {
+            ggml_vk_free_peer_copy_buf(*picked);
+            if (!ggml_vk_alloc_peer_copy_buf(src_dev, dst_dev, *picked, required_size)) {
+                return nullptr;
+            }
+        }
+    }
+    pool.buf_idx++;
+    return picked;
+}
+
 static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
    VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
@@ -13776,9 +13913,66 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
    if (ggml_backend_buffer_is_vk(src->buffer)) {
        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;

-        // Async copy only works within the same device
        if (src_buf_ctx->dev_buffer->device != dst_buf->device) {
-            return false;
+            // Cross-device copy via per-copy staging buffer
+            ggml_backend_vk_context * src_ctx = (ggml_backend_vk_context *)backend_src->context;
+            vk_device src_dev = src_ctx->device;
+            vk_device dst_dev = ctx->device;
+
+            size_t nbytes = ggml_nbytes(src);
+            vk_buffer src_vk_buf = src_buf_ctx->dev_buffer;
+            size_t src_offset = vk_tensor_offset(src) + src->view_offs;
+            size_t dst_offset = vk_tensor_offset(dst) + dst->view_offs;
+
+            vk_peer_copy_buf * copy_buf = ggml_vk_get_peer_copy_buf(src_dev, dst_dev, nbytes);
+            if (!copy_buf) {
+                return false;
+            }
+
+            auto& staging = src_dev->peer_staging[dst_dev.get()];
+
+            // HOP 1: src VRAM → staging (on source compute queue)
+            // Implicit queue submission ordering guarantees this
+            // executes after all prior compute work.
+            vk_context hop1_ctx;
+            {
+                std::lock_guard<std::recursive_mutex> guard(src_dev->mutex);
+                hop1_ctx = ggml_vk_create_temporary_context(src_dev->compute_queue.cmd_pool);
+                ggml_vk_ctx_begin(src_dev, hop1_ctx);
+
+                VkBufferCopy bc{ src_offset, 0, nbytes };
+                vkCmdCopyBuffer(hop1_ctx->s->buffer->buf,
+                                (VkBuffer)src_vk_buf->buffer,
+                                (VkBuffer)copy_buf->src_buf->buffer,
+                                1, &bc);
+
+                ggml_vk_ctx_end(hop1_ctx);
+            }
+
+            // Submit hop1, signal timeline semaphore, CPU wait
+            staging.tl_sem_value++;
+            hop1_ctx->seqs.back().back().signal_semaphores.push_back(
+                { staging.tl_sem, staging.tl_sem_value });
+            ggml_vk_submit(hop1_ctx, {});
+            src_ctx->submit_pending = true;
+
+            vk::SemaphoreWaitInfo swi{
+                vk::SemaphoreWaitFlags{},
+                1, &staging.tl_sem, &staging.tl_sem_value
+            };
+            VK_CHECK(src_dev->device.waitSemaphores(swi, UINT64_MAX),
+                     "cross_device_hop1 waitSemaphores");
+
+            // HOP 2: staging → dst VRAM (on dest device)
+            vk_context dst_compute_ctx = ggml_vk_get_compute_ctx(ctx);
+
+            VkBufferCopy bc2{ 0, dst_offset, nbytes };
+            vkCmdCopyBuffer(dst_compute_ctx->s->buffer->buf,
+                            (VkBuffer)copy_buf->dst_buf->buffer,
+                            (VkBuffer)dst_buf->buffer,
+                            1, &bc2);
+
+            return true;
        }

        vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
@@ -13815,7 +14009,6 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
                                          src->data, ggml_nbytes(src));
    }

-    GGML_UNUSED(backend_src);
    return false;
 }
Author	SHA1	Message	Date
Ruben Ortlam	4cabbe36e0	state	2026-04-09 13:00:31 +02:00
Ruben Ortlam	9f001cae27	state	2026-04-09 12:51:43 +02:00
Ruben Ortlam	88335c0490	state	2026-04-09 12:39:51 +02:00
Ruben Ortlam	204023c897	state	2026-04-09 12:36:15 +02:00
Ruben Ortlam	d88d722fc1	state	2026-04-09 12:32:08 +02:00
Ruben Ortlam	96d9516329	state	2026-04-09 12:25:27 +02:00
Ruben Ortlam	8a108eddb4	state	2026-04-09 12:05:15 +02:00
Ruben Ortlam	47dde34e00	state	2026-04-09 11:58:46 +02:00
Ruben Ortlam	8d0e158076	state	2026-04-09 11:51:39 +02:00
Ruben Ortlam	aade0f81dd	state	2026-04-09 11:42:50 +02:00
Ruben Ortlam	700270239d	state	2026-04-09 11:24:21 +02:00
Ruben Ortlam	ddaafa3dc1	state	2026-04-09 11:11:17 +02:00
Ruben Ortlam	e5e0be0add	state	2026-04-09 11:00:36 +02:00
Ruben Ortlam	3c4eae7dc9	state	2026-04-09 07:50:05 +02:00
Ruben Ortlam	7e2799c8c9	state	2026-04-09 07:40:02 +02:00
Ruben Ortlam	cd0722594a	state	2026-04-09 07:25:33 +02:00
Martin Klacer	5c4aae66e1	devops: kleidiai: provide KleidiAI-Enabled ARM Release Artifact (#21259 ) * Unified macOS release setup with strategy-matrix block * Added KleidiAI arm64 macOS release definition Change-Id: I05520889ffc646488a178d06817a17f29274465a Signed-off-by: Martin Klacer <martin.klacer@arm.com>	2026-04-08 13:06:12 +08:00
Aman Gupta	c5ce4bc227	CUDA: make cuda graphs props check faster (#21472 ) * CUDA: compute fast hash instead of expensive props check * use seen node * use memcp	2026-04-08 09:05:51 +08:00