diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index b005b4901d..55f751c98c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -608,6 +608,7 @@ struct vk_device_struct { uint64_t suballocation_block_size; uint64_t min_imported_host_pointer_alignment; bool external_memory_host {}; + bool external_memory_dma_buf {}; bool external_semaphore_fd {}; bool fp16; bool bf16; @@ -4977,6 +4978,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->memory_priority = true; } else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) { device->external_memory_host = true; + } else if (strcmp("VK_EXT_external_memory_dma_buf", properties.extensionName) == 0) { + device->external_memory_dma_buf = true; } else if (strcmp("VK_KHR_external_semaphore_fd", properties.extensionName) == 0) { device->external_semaphore_fd = true; #if defined(VK_EXT_shader_64bit_indexing) @@ -5278,6 +5281,11 @@ static vk_device ggml_vk_get_device(size_t idx) { device_extensions.push_back("VK_EXT_external_memory_host"); } + if (device->external_memory_dma_buf) { + device_extensions.push_back("VK_KHR_external_memory_fd"); + device_extensions.push_back("VK_EXT_external_memory_dma_buf"); + } + if (device->external_semaphore_fd) { device_extensions.push_back("VK_KHR_external_semaphore_fd"); } @@ -12820,16 +12828,6 @@ struct vk_shared_staging { } }; -// Helper: run a benchmark and print results -static void vk_bench_print(const char * name, std::vector & times, size_t size) { - std::sort(times.begin(), times.end()); - double median = times[times.size() / 2]; - double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0); - std::cerr << " " << std::left << std::setw(22) << name << " : " - << std::fixed << std::setprecision(3) << median << " ms " - << std::setprecision(2) << bw << " GB/s" << std::endl; -} - // Results stored per (method, size) for table output struct vk_copy_result { std::string method; @@ -12891,11 +12889,6 @@ static void ggml_vk_bench_pair( results[method].push_back({ method, median, bw }); }; - // Helper to record a skipped size (sentinel: negative ms) - auto skip = [&](const std::string & method) { - results[method].push_back({ method, -1.0, -1.0 }); - }; - for (size_t size : test_sizes) { // ================================================================= @@ -12935,49 +12928,7 @@ static void ggml_vk_bench_pair( } // ================================================================= - // 2. Diagnostics: individual hop timings - // ================================================================= - { - std::vector times; - for (size_t i = 0; i < num_it + warmup; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - { - std::lock_guard guard(dev0->mutex); - vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(dev0, subctx); - ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size); - ggml_vk_ctx_end(subctx); - ggml_vk_submit(subctx, dev0->fence); - VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "diag hop1"); - dev0->device.resetFences({ dev0->fence }); - } - auto end = std::chrono::high_resolution_clock::now(); - if (i >= warmup) times.push_back(std::chrono::duration_cast(end - begin).count() / 1000.0); - } - record("hop1_only", size, times); - } - { - std::vector times; - for (size_t i = 0; i < num_it + warmup; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - { - std::lock_guard guard(dev1->mutex); - vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(dev1, subctx); - ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size); - ggml_vk_ctx_end(subctx); - ggml_vk_submit(subctx, dev1->fence); - VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "diag hop2"); - dev1->device.resetFences({ dev1->fence }); - } - auto end = std::chrono::high_resolution_clock::now(); - if (i >= warmup) times.push_back(std::chrono::duration_cast(end - begin).count() / 1000.0); - } - record("hop2_only", size, times); - } - - // ================================================================= - // 3. Shared staging: single host buffer imported into both devices + // 2. Shared staging: single host buffer imported into both devices // ================================================================= if (has_shared_staging) { vk_shared_staging stg; @@ -13018,30 +12969,21 @@ static void ggml_vk_bench_pair( } // ================================================================= - // 4. Chunked pipeline: split into N chunks, overlap hop1/hop2 - // via full-duplex PCIe. Vary chunk count to find optimum. + // 3. Chunked pipeline: split into 2 chunks, overlap hop1/hop2 + // via full-duplex PCIe. // ================================================================= - if (has_shared_staging) { - for (size_t n_chunks : { 2, 4, 8 }) { - char cname[32]; - snprintf(cname, sizeof(cname), "chunked_%zu", n_chunks); - if (size < n_chunks * 4096) { skip(cname); continue; } + if (has_shared_staging && size >= 2 * 4096) { + constexpr size_t n_chunks = 2; - size_t align = std::max(dev0->min_imported_host_pointer_alignment, - dev1->min_imported_host_pointer_alignment); - size_t chunk_data = size / n_chunks; - size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1); + size_t align = std::max(dev0->min_imported_host_pointer_alignment, + dev1->min_imported_host_pointer_alignment); + size_t chunk_data = size / n_chunks; + size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1); - vk_shared_staging stg; - if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) { - std::cerr << " chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl; - stg.free_resources(); - continue; - } - - // Per-chunk timeline semaphores - std::vector chunk_sems(n_chunks); - std::vector sem_vals(n_chunks, 0); + vk_shared_staging stg; + if (stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) { + vk::Semaphore chunk_sems[n_chunks]; + uint64_t sem_vals[n_chunks] = {}; for (size_t c = 0; c < n_chunks; c++) { vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; vk::SemaphoreCreateInfo sci{}; @@ -13053,7 +12995,6 @@ static void ggml_vk_bench_pair( for (size_t iter = 0; iter < num_it + warmup; iter++) { auto begin = std::chrono::high_resolution_clock::now(); - // Submit all hop1s upfront for (size_t c = 0; c < n_chunks; c++) { size_t off_src = c * chunk_data; size_t off_stg = c * chunk_aligned; @@ -13068,7 +13009,6 @@ static void ggml_vk_bench_pair( ggml_vk_submit(subctx, {}); } - // Per-chunk: CPU wait hop1, submit hop2 for (size_t c = 0; c < n_chunks; c++) { size_t off_dst = c * chunk_data; size_t off_stg = c * chunk_aligned; @@ -13091,17 +13031,17 @@ static void ggml_vk_bench_pair( if (iter >= warmup) times.push_back(std::chrono::duration_cast(end - begin).count() / 1000.0); } - char name[32]; - snprintf(name, sizeof(name), "chunked_%zu", n_chunks); - record(name, size, times); + record("chunked_2", size, times); for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(chunk_sems[c]); - stg.free_resources(); + } else { + std::cerr << " chunked_2 : SKIPPED (import failed)" << std::endl; } + stg.free_resources(); } // ================================================================= - // 5. sync_fd async: fully GPU-synchronised via Linux sync_file + // 4. sync_fd async: fully GPU-synchronised via Linux sync_file // ================================================================= #ifndef _WIN32 if (has_shared_staging && has_syncfd) { @@ -13184,35 +13124,26 @@ static void ggml_vk_bench_pair( } // ================================================================= - // 6. sync_fd chunked: chunked pipeline with GPU-side sync_fd + // 5. sync_fd chunked: 2-chunk pipeline with GPU-side sync_fd // between hops (no CPU waits between chunks) // ================================================================= - if (has_shared_staging && has_syncfd) { - for (size_t n_chunks : { 2, 4, 8 }) { - char scname[48]; - snprintf(scname, sizeof(scname), "syncfd_chunked_%zu", n_chunks); - if (size < n_chunks * 4096) { skip(scname); continue; } + if (has_shared_staging && has_syncfd && size >= 2 * 4096) { + constexpr size_t n_chunks = 2; - size_t align = std::max(dev0->min_imported_host_pointer_alignment, - dev1->min_imported_host_pointer_alignment); - size_t chunk_data = size / n_chunks; - size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1); - - vk_shared_staging stg; - if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) { - std::cerr << " syncfd_chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl; - stg.free_resources(); - continue; - } + size_t align = std::max(dev0->min_imported_host_pointer_alignment, + dev1->min_imported_host_pointer_alignment); + size_t chunk_data = size / n_chunks; + size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1); + vk_shared_staging stg; + if (stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) { std::vector times; bool run_ok = true; for (size_t iter = 0; iter < num_it + warmup && run_ok; iter++) { auto begin = std::chrono::high_resolution_clock::now(); - // Create per-chunk exportable semaphores - std::vector sems_dev0(n_chunks); + vk::Semaphore sems_dev0[n_chunks]; for (size_t c = 0; c < n_chunks; c++) { vk::ExportSemaphoreCreateInfo esci{}; esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd; @@ -13221,7 +13152,6 @@ static void ggml_vk_bench_pair( sems_dev0[c] = dev0->device.createSemaphore(sci); } - // Submit all hop1s with per-chunk signal for (size_t c = 0; c < n_chunks; c++) { size_t off_src = c * chunk_data; size_t off_stg = c * chunk_aligned; @@ -13235,7 +13165,6 @@ static void ggml_vk_bench_pair( ggml_vk_submit(subctx, {}); } - // Export all sync_fds and import on dev1, submit hop2s for (size_t c = 0; c < n_chunks && run_ok; c++) { size_t off_dst = c * chunk_data; size_t off_stg = c * chunk_aligned; @@ -13248,8 +13177,7 @@ static void ggml_vk_bench_pair( gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd; sync_fd = dev0->device.getSemaphoreFdKHR(gi); } catch (vk::SystemError& e) { - char nm[48]; snprintf(nm, sizeof(nm), "syncfd_chunked_%zu", n_chunks); - std::cerr << " " << nm << " : SKIPPED (export: " << e.what() << ")" << std::endl; + std::cerr << " syncfd_chunked_2 : SKIPPED (export: " << e.what() << ")" << std::endl; run_ok = false; break; } @@ -13262,8 +13190,7 @@ static void ggml_vk_bench_pair( ii.fd = sync_fd; dev1->device.importSemaphoreFdKHR(ii); } catch (vk::SystemError& e) { - char nm[48]; snprintf(nm, sizeof(nm), "syncfd_chunked_%zu", n_chunks); - std::cerr << " " << nm << " : SKIPPED (import: " << e.what() << ")" << std::endl; + std::cerr << " syncfd_chunked_2 : SKIPPED (import: " << e.what() << ")" << std::endl; dev1->device.destroySemaphore(sem_dev1); close(sync_fd); run_ok = false; break; @@ -13290,13 +13217,157 @@ static void ggml_vk_bench_pair( if (run_ok && iter >= warmup) times.push_back(std::chrono::duration_cast(end - begin).count() / 1000.0); } - if (run_ok) { - char name[48]; - snprintf(name, sizeof(name), "syncfd_chunked_%zu", n_chunks); - record(name, size, times); - } - stg.free_resources(); + if (run_ok) record("syncfd_chunked_2", size, times); + } else { + std::cerr << " syncfd_chunked_2 : SKIPPED (import failed)" << std::endl; } + stg.free_resources(); + } + + // ================================================================= + // 6. DMA-BUF P2P: import source device memory directly on dest + // device via dma-buf fd — true zero-copy if PCIe P2P works. + // Setup (export/import) is outside the timing loop. + // ================================================================= + if (dev0->external_memory_dma_buf && dev1->external_memory_dma_buf) { + bool setup_ok = true; + + // Create exportable source buffer on dev0 + vk::Buffer exp_buffer{}; + vk::DeviceMemory exp_mem{}; + + vk::ExternalMemoryBufferCreateInfo exp_ext_bci{}; + exp_ext_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eDmaBufEXT; + vk::BufferCreateInfo exp_bci{}; + exp_bci.size = size; + exp_bci.usage = vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; + exp_bci.setPNext(&exp_ext_bci); + + try { + exp_buffer = dev0->device.createBuffer(exp_bci); + vk::MemoryRequirements exp_mem_req = dev0->device.getBufferMemoryRequirements(exp_buffer); + + vk::PhysicalDeviceMemoryProperties mem_props = dev0->physical_device.getMemoryProperties(); + uint32_t exp_mem_type = UINT32_MAX; + for (uint32_t m = 0; m < mem_props.memoryTypeCount; m++) { + if ((exp_mem_req.memoryTypeBits & (1u << m)) && + (mem_props.memoryTypes[m].propertyFlags & vk::MemoryPropertyFlagBits::eDeviceLocal)) { + exp_mem_type = m; + break; + } + } + if (exp_mem_type == UINT32_MAX) { + throw vk::SystemError(vk::make_error_code(vk::Result::eErrorInitializationFailed)); + } + + vk::ExportMemoryAllocateInfo export_ai{}; + export_ai.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eDmaBufEXT; + vk::MemoryAllocateInfo exp_alloc{}; + exp_alloc.allocationSize = exp_mem_req.size; + exp_alloc.memoryTypeIndex = exp_mem_type; + exp_alloc.setPNext(&export_ai); + exp_mem = dev0->device.allocateMemory(exp_alloc); + dev0->device.bindBufferMemory(exp_buffer, exp_mem, 0); + + // Copy data from buf_src into exportable buffer + vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(dev0, subctx); + VkBufferCopy fill_bc{ 0, 0, size }; + vkCmdCopyBuffer(subctx->s->buffer->buf, buf_src->buffer, exp_buffer, 1, &fill_bc); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, dev0->fence); + VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "dmabuf fill"); + dev0->device.resetFences({ dev0->fence }); + } catch (vk::SystemError& e) { + std::cerr << " dmabuf_p2p : SKIPPED (src alloc: " << e.what() << ")" << std::endl; + if (exp_buffer) dev0->device.destroyBuffer(exp_buffer); + if (exp_mem) dev0->device.freeMemory(exp_mem); + setup_ok = false; + } + + // Export fd and import on dev1 — once, outside timing loop + vk::Buffer imported_buffer{}; + vk::DeviceMemory imported_mem{}; + + if (setup_ok) { + try { + vk::MemoryGetFdInfoKHR gi{}; + gi.memory = exp_mem; + gi.handleType = vk::ExternalMemoryHandleTypeFlagBits::eDmaBufEXT; + int dmabuf_fd = dev0->device.getMemoryFdKHR(gi); + + vk::MemoryFdPropertiesKHR fd_props = dev1->device.getMemoryFdPropertiesKHR( + vk::ExternalMemoryHandleTypeFlagBits::eDmaBufEXT, dmabuf_fd); + + if (fd_props.memoryTypeBits == 0) { + close(dmabuf_fd); + throw vk::SystemError(vk::make_error_code(vk::Result::eErrorFormatNotSupported)); + } + + vk::ExternalMemoryBufferCreateInfo imp_ext_bci{}; + imp_ext_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eDmaBufEXT; + vk::BufferCreateInfo imp_bci{}; + imp_bci.size = size; + imp_bci.usage = vk::BufferUsageFlagBits::eTransferSrc; + imp_bci.setPNext(&imp_ext_bci); + imported_buffer = dev1->device.createBuffer(imp_bci); + + vk::MemoryRequirements mem_req = dev1->device.getBufferMemoryRequirements(imported_buffer); + uint32_t mem_type_idx = UINT32_MAX; + for (uint32_t m = 0; m < 32; m++) { + if ((fd_props.memoryTypeBits & (1u << m)) && (mem_req.memoryTypeBits & (1u << m))) { + mem_type_idx = m; + break; + } + } + if (mem_type_idx == UINT32_MAX) { + close(dmabuf_fd); + throw vk::SystemError(vk::make_error_code(vk::Result::eErrorFormatNotSupported)); + } + + vk::ImportMemoryFdInfoKHR import_info{}; + import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eDmaBufEXT; + import_info.fd = dmabuf_fd; + vk::MemoryAllocateInfo alloc_info{}; + alloc_info.allocationSize = mem_req.size; + alloc_info.memoryTypeIndex = mem_type_idx; + alloc_info.setPNext(&import_info); + imported_mem = dev1->device.allocateMemory(alloc_info); + dev1->device.bindBufferMemory(imported_buffer, imported_mem, 0); + } catch (vk::SystemError& e) { + std::cerr << " dmabuf_p2p : SKIPPED (import: " << e.what() << ")" << std::endl; + if (imported_buffer) dev1->device.destroyBuffer(imported_buffer); + if (imported_mem) dev1->device.freeMemory(imported_mem); + imported_buffer = vk::Buffer{}; + imported_mem = vk::DeviceMemory{}; + setup_ok = false; + } + } + + if (setup_ok) { + std::vector times; + for (size_t i = 0; i < num_it + warmup; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + + vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(dev1, subctx); + VkBufferCopy bc{ 0, 0, size }; + vkCmdCopyBuffer(subctx->s->buffer->buf, imported_buffer, buf_dst->buffer, 1, &bc); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, dev1->fence); + VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "dmabuf_p2p"); + dev1->device.resetFences({ dev1->fence }); + + auto end = std::chrono::high_resolution_clock::now(); + if (i >= warmup) times.push_back(std::chrono::duration_cast(end - begin).count() / 1000.0); + } + record("dmabuf_p2p", size, times); + } + + if (imported_buffer) dev1->device.destroyBuffer(imported_buffer); + if (imported_mem) dev1->device.freeMemory(imported_mem); + if (exp_buffer) dev0->device.destroyBuffer(exp_buffer); + if (exp_mem) dev0->device.freeMemory(exp_mem); } #endif } @@ -13321,7 +13392,10 @@ static void ggml_vk_test_cross_device_copy(ggml_backend_vk_context * ctx) { std::vector devices(n_devices); for (size_t i = 0; i < n_devices; i++) { devices[i] = ggml_vk_get_device(i); - std::cerr << " [" << i << "] " << devices[i]->name << std::endl; + std::cerr << " [" << i << "] " << devices[i]->name + << " (ext_mem_host=" << devices[i]->external_memory_host + << " dma_buf=" << devices[i]->external_memory_dma_buf + << " sem_fd=" << devices[i]->external_semaphore_fd << ")" << std::endl; } const std::vector test_sizes = {