diff --git a/.gitattributes b/.gitattributes
index e9f9b494a7..06c85ad56e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,6 +1,3 @@
-# Force LF line endings everywhere (prevent Windows CRLF conversion).
-* text=auto eol=lf
-
 # Treat the generated single-file WebUI build as binary for diff purposes.
 # Git's pack-file delta compression still works (byte-level), but this prevents
 # git diff from printing the entire minified file on every change.
diff --git a/ggml/src/ggml-cuda/allreduce.cu b/ggml/src/ggml-cuda/allreduce.cu
index f5f3c53dd1..43999b5d56 100644
--- a/ggml/src/ggml-cuda/allreduce.cu
+++ b/ggml/src/ggml-cuda/allreduce.cu
@@ -233,6 +233,10 @@ static __global__ void ggml_cuda_ar_kernel(
 // in-flight depth (single digits in practice) while keeping init cost low.
 static constexpr int GGML_CUDA_AR_POOL_SIZE = 128;
 
+// Maximum chunk size (bytes per GPU) handled by one internal kernel launch.
+// Larger tensors are reduced by issuing multiple chunked launches.
+static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 256 * 1024; // 256 KB
+
 // Byte spacing between adjacent arrival ints. 128 bytes (two cache lines)
 // ensures the arrival slots for the two GPUs never share a cache line,
 // preventing false-sharing stalls on the polling GPU.
@@ -279,6 +283,41 @@ static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot,
     return reinterpret_cast<int *>(p->arrival + offset);
 }
 
+static int ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
+    const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
+    const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
+    p->call_count++;
+
+    if (pool_lapped) {
+        for (int i = 0; i < p->n_devices; ++i) {
+            ggml_cuda_set_device(p->devices[i]);
+            CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
+        }
+    }
+
+    for (int i = 0; i < p->n_devices; ++i) {
+        *ggml_cuda_ar_arrival_ptr(p, slot, i) = 0;
+    }
+
+    return slot;
+}
+
+static void ggml_cuda_ar_wait_for_compute(
+        ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
+    ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
+    CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
+    CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
+}
+
+static void ggml_cuda_ar_record_chunk_done(
+        ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot, bool last_chunk) {
+    ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
+    CUDA_CHECK(cudaEventRecord(ev.ker, p->streams[rank]));
+    if (last_chunk) {
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), ev.ker));
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Background watchdog thread — monitors per-GPU debug ring buffers for new
 // bailout records. The kernel writes a record when it hits the spin limit;
@@ -315,14 +354,51 @@ static void ggml_cuda_ar_wdog_thread(ggml_cuda_ar_pipeline * p) {
         std::this_thread::sleep_for(std::chrono::milliseconds(GGML_CUDA_AR_WDOG_POLL_MS));
     }
 }
+
+static bool ggml_cuda_ar_wdog_init(ggml_cuda_ar_pipeline * p) {
+    for (int i = 0; i < p->n_devices; ++i) {
+        if (cudaHostAlloc(reinterpret_cast<void **>(&p->debug_ring[i]),
+                          sizeof(ggml_cuda_ar_debug_ring),
+                          cudaHostAllocPortable) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaHostAlloc for debug ring failed on device %d\n",
+                           __func__, p->devices[i]);
+            return false;
+        }
+        memset(p->debug_ring[i], 0, sizeof(ggml_cuda_ar_debug_ring));
+    }
+
+    const char * spin_env = getenv("GGML_CUDA_AR_MAX_SPIN");
+    p->wdog_max_spin = (spin_env && spin_env[0]) ? atoi(spin_env) : 0;
+    GGML_LOG_INFO("%s: AR watchdog enabled — max_spin=%d "
+                  "(set GGML_CUDA_AR_MAX_SPIN=<n> to adjust)\n",
+                  __func__, p->wdog_max_spin);
+
+    p->wdog_stop.store(false);
+    p->wdog_thr = std::thread(ggml_cuda_ar_wdog_thread, p);
+    return true;
+}
+
+static void ggml_cuda_ar_wdog_stop(ggml_cuda_ar_pipeline * p) {
+    p->wdog_stop.store(true);
+    if (p->wdog_thr.joinable()) {
+        p->wdog_thr.join();
+    }
+}
+
+static void ggml_cuda_ar_wdog_free(ggml_cuda_ar_pipeline * p) {
+    for (int i = 0; i < p->n_devices; ++i) {
+        if (p->debug_ring[i]) {
+            cudaFreeHost(p->debug_ring[i]);
+        }
+    }
+}
 #endif // GGML_CUDA_AR_WATCHDOG
 
 // ---------------------------------------------------------------------------
 // Init / free
 // ---------------------------------------------------------------------------
 
-ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
-        const int * devices, size_t n_devices, size_t max_bytes) {
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
 
     if ((n_devices != 2) || (n_devices > GGML_CUDA_MAX_DEVICES)) {
         return nullptr;
@@ -386,49 +462,28 @@ ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
     memset(p->arrival, 0, arrival_bytes);
 
     // Per-device pinned staging buffers.
-    p->buf_bytes = max_bytes;
+    p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
     for (int i = 0; i < n_devices; ++i) {
-        if (cudaHostAlloc(&p->host_buf[i], max_bytes, cudaHostAllocPortable) != cudaSuccess) {
+        if (cudaHostAlloc(&p->host_buf[i], p->buf_bytes, cudaHostAllocPortable) != cudaSuccess) {
             GGML_LOG_ERROR("%s: cudaHostAlloc for staging failed (%zu bytes)\n",
-                           __func__, max_bytes);
+                           __func__, p->buf_bytes);
             ggml_cuda_ar_pipeline_free(p);
             return nullptr;
         }
-        memset(p->host_buf[i], 0, max_bytes);
+        memset(p->host_buf[i], 0, p->buf_bytes);
     }
 
 #if GGML_CUDA_AR_WATCHDOG
-    // Per-GPU debug ring buffers: written by the kernel on spin-limit bailout,
-    // polled by the background watchdog thread. Each ring is pinned host
-    // memory accessed only by its owning GPU (single-GPU host atomics OK).
-    {
-        for (int i = 0; i < n_devices; ++i) {
-            if (cudaHostAlloc(reinterpret_cast<void **>(&p->debug_ring[i]),
-                              sizeof(ggml_cuda_ar_debug_ring),
-                              cudaHostAllocPortable) != cudaSuccess) {
-                GGML_LOG_ERROR("%s: cudaHostAlloc for debug ring failed on device %d\n",
-                               __func__, p->devices[i]);
-                ggml_cuda_ar_pipeline_free(p);
-                return nullptr;
-            }
-            memset(p->debug_ring[i], 0, sizeof(ggml_cuda_ar_debug_ring));
-        }
-
-        const char * spin_env = getenv("GGML_CUDA_AR_MAX_SPIN");
-        p->wdog_max_spin = (spin_env && spin_env[0]) ? atoi(spin_env) : 0;
-        GGML_LOG_INFO("%s: AR watchdog enabled — max_spin=%d "
-                      "(set GGML_CUDA_AR_MAX_SPIN=<n> to adjust)\n",
-                      __func__, p->wdog_max_spin);
-
-        p->wdog_stop.store(false);
-        p->wdog_thr = std::thread(ggml_cuda_ar_wdog_thread, p);
+    if (!ggml_cuda_ar_wdog_init(p)) {
+        ggml_cuda_ar_pipeline_free(p);
+        return nullptr;
     }
 #endif
 
     GGML_LOG_INFO("%s: initialized AllReduce pipeline: %d GPUs, "
                   "%zu KB staging per GPU\n",
-                  __func__, n_devices, max_bytes >> 10);
-
+                  __func__, n_devices, p->buf_bytes >> 10);
+
     return p;
 }
 
@@ -440,10 +495,7 @@ void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
 #if GGML_CUDA_AR_WATCHDOG
     // Stop the watchdog thread first — it only reads pinned host memory,
     // no GPU resources, so this is safe and returns within ~1ms.
-    p->wdog_stop.store(true);
-    if (p->wdog_thr.joinable()) {
-        p->wdog_thr.join();
-    }
+    ggml_cuda_ar_wdog_stop(p);
 #endif
 
     // Drain all in-flight kernels before tearing down resources.
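
Note on the new constant: with GGML_CUDA_AR_MAX_BYTES = 256 KB of staging per GPU, a tensor is reduced in ceil(nbytes / 256 KB) chunked launches, each covering at most buf_bytes / type_size elements. A minimal standalone sketch of that arithmetic (illustrative only; the constant and the max_chunk_elems expression come from the hunks above, everything else is a stand-in):

    // Standalone sketch: how many chunked kernel launches a tensor needs under
    // the 256 KB per-GPU staging cap (mirrors max_chunk_elems = buf_bytes / type_size).
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static constexpr size_t AR_MAX_BYTES = 256 * 1024; // per-GPU staging size

    static size_t ar_num_chunks(int64_t ne, size_t type_size) {
        const size_t max_chunk_elems = AR_MAX_BYTES / type_size;       // 65536 elems for F32
        return ((size_t) ne + max_chunk_elems - 1) / max_chunk_elems;  // ceiling division
    }

    int main() {
        // e.g. an F32 tensor of 262144 elements is 1 MiB -> 4 launches of 64 Ki elements each
        printf("chunks = %zu\n", ar_num_chunks(262144, sizeof(float)));
        return 0;
    }
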
@@ -475,11 +527,7 @@ void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
         cudaFreeHost(p->arrival);
     }
 #if GGML_CUDA_AR_WATCHDOG
-    for (int i = 0; i < p->n_devices; ++i) {
-        if (p->debug_ring[i]) {
-            cudaFreeHost(p->debug_ring[i]);
-        }
-    }
+    ggml_cuda_ar_wdog_free(p);
 #endif
     delete p;
 }
@@ -495,31 +543,18 @@ bool ggml_cuda_ar_allreduce(
     GGML_ASSERT(p != nullptr);
 
     const int n = p->n_devices;
-
-    // Only the 2-GPU path is implemented; fall back for larger communicators.
-    if (n != 2) {
-        return false;
-    }
+    GGML_ASSERT(n == 2);
 
     const ggml_type type = tensors[0]->type;
     const size_t type_size = ggml_type_size(type);
-
-    // Only float, half, and bfloat16 tensors are handled by the kernel.
-    if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
-        return false;
-    }
+    GGML_ASSERT(type == GGML_TYPE_F32 || type == GGML_TYPE_F16 || type == GGML_TYPE_BF16);
 
     const int64_t ne = ggml_nelements(tensors[0]);
-
-    if (ne == 0) {
-        return true;
-    }
-
-    if (p->buf_bytes < type_size) {
-        return false;
-    }
+    GGML_ASSERT(ne > 0);
+    GGML_ASSERT(p->buf_bytes >= type_size);
 
     const size_t max_chunk_elems = p->buf_bytes / type_size;
+    GGML_ASSERT(max_chunk_elems > 0);
 
     // Insert chunked kernels into each GPU's existing compute stream via events:
     //   record(app, compute_stream) — capture "upstream done"
@@ -532,36 +567,17 @@ bool ggml_cuda_ar_allreduce(
         const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
         const size_t chunk_bytes = chunk_elems * type_size;
-        // Cycle through the event pool. On the second pass through the ring,
-        // synchronise on the slot's ker event before touching arrival ints —
-        // the event and arrival pools wrap in lock-step so this guarantees the
-        // kernels which last used this slot have finished.
-        const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
-        const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
-        p->call_count++;
-
-        if (pool_lapped) {
-            for (int i = 0; i < n; ++i) {
-                ggml_cuda_set_device(p->devices[i]);
-                CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
-            }
-        }
-
-        // Reset the arrival ints for this slot before any kernel can read them.
-        for (int i = 0; i < n; ++i) {
-            *ggml_cuda_ar_arrival_ptr(p, slot, i) = 0;
-        }
+        const int slot = ggml_cuda_ar_acquire_slot(p);
+        const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
 
         for (int i = 0; i < n; ++i) {
             const int peer = 1 - i; // valid for n == 2 only
             ggml_cuda_set_device(p->devices[i]);
             auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
-            ggml_cuda_ar_event_slot & ev = p->ev_pool[i][slot];
 
             const bool compute = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
 
             if (chunk_start == 0) {
-                CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
-                CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], ev.app));
+                ggml_cuda_ar_wait_for_compute(p, cuda_ctx, i, slot);
             }
 
             char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) type_size;
@@ -600,10 +616,7 @@ bool ggml_cuda_ar_allreduce(
 #undef GGML_CUDA_AR_WDOG_EXTRA_ARGS
             CUDA_CHECK(cudaGetLastError());
 
-            CUDA_CHECK(cudaEventRecord(ev.ker, p->streams[i]));
-            if (chunk_start + (int64_t) chunk_elems == ne) {
-                CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), ev.ker));
-            }
+            ggml_cuda_ar_record_chunk_done(p, cuda_ctx, i, slot, last_chunk);
         }
     }
 
diff --git a/ggml/src/ggml-cuda/allreduce.cuh b/ggml/src/ggml-cuda/allreduce.cuh
index fbf1152179..1f7f9f41ba 100644
--- a/ggml/src/ggml-cuda/allreduce.cuh
+++ b/ggml/src/ggml-cuda/allreduce.cuh
@@ -5,21 +5,14 @@
 
 #include <cstddef>
 
-// Maximum chunk size (bytes per GPU) handled by one internal kernel launch.
-// Larger tensors are reduced by issuing multiple chunked launches.
-static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 256 * 1024; // 256 KB
-
 // Opaque pipeline context — owns all pinned buffers, streams, and events.
 struct ggml_cuda_ar_pipeline;
 
 // Allocate a pipeline for n_devices GPUs.
 // devices[] holds the CUDA device IDs in rank order.
-// max_bytes is the staging buffer size per device, which also bounds the
-// per-launch chunk size; tensors larger than this are reduced via multiple
-// chunked launches.
 // Returns nullptr on allocation failure.
 ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
-        const int * devices, size_t n_devices, size_t max_bytes);
+        const int * devices, size_t n_devices);
 
 // Release all resources owned by the pipeline.
 void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
@@ -27,9 +20,8 @@ void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
 // Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
 // tensors[i] must live on the device managed by backends[i] and be
 // contiguous F32, F16, or BF16.
-// Returns true on success. Returns false when the tensor type or size is
-// outside the currently supported range; the caller should fall back to
-// another provider (NCCL or the meta-backend CPU reduce).
+// Preconditions are checked by the CUDA comm dispatcher before calling this.
+// Returns true once the reduction work has been enqueued successfully.
 bool ggml_cuda_ar_allreduce(
         ggml_cuda_ar_pipeline * pipeline,
         ggml_backend_t * backends,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 9bec84551b..93308bcfbe 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1223,8 +1223,7 @@ static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_ba
     // Try to allocate the internal pipeline unless the user forced NCCL.
     if (!force_nccl) {
-        ret->ar_pipeline = ggml_cuda_ar_pipeline_init(
-                ret->dev_ids.data(), n_backends, GGML_CUDA_AR_MAX_BYTES);
+        ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), n_backends);
         if (ret->ar_pipeline == nullptr) {
             // Clear any sticky CUDA error from the failed init so it can't
             // leak into a later NCCL call.
@@ -1336,7 +1335,7 @@ enum ggml_cuda_comm_allreduce_result {
 static ggml_cuda_comm_allreduce_result ggml_backend_cuda_comm_try_allreduce_internal(
         ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
     if (comm_ctx->ar_pipeline == nullptr) {
-        GGML_LOG_WARN("%s: internal unsupported: pipeline unavailable\n", __func__);
+        GGML_LOG_DEBUG("%s: internal unsupported: pipeline unavailable\n", __func__);
         return GGML_CUDA_COMM_ALLREDUCE_UNSUPPORTED;
     }
 
@@ -1348,12 +1347,12 @@ static ggml_cuda_comm_allreduce_result ggml_backend_cuda_comm_try_allreduce_inte
     const ggml_type type = tensors[0]->type;
 
     if (n_backends != 2) {
-        GGML_LOG_WARN("%s: internal unsupported: n_backends=%zu\n", __func__, n_backends);
+        GGML_LOG_DEBUG("%s: internal unsupported: n_backends=%zu\n", __func__, n_backends);
         return GGML_CUDA_COMM_ALLREDUCE_UNSUPPORTED;
     }
 
     if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
-        GGML_LOG_WARN("%s: internal unsupported: type=%d\n", __func__, (int) type);
+        GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
         return GGML_CUDA_COMM_ALLREDUCE_UNSUPPORTED;
     }
 
@@ -1372,14 +1371,14 @@ static ggml_cuda_comm_allreduce_result ggml_backend_cuda_comm_try_allreduce_inte
             return GGML_CUDA_COMM_ALLREDUCE_FAILED;
         }
         if (!ggml_is_contiguously_allocated(tensors[i])) {
-            GGML_LOG_WARN("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
-                    __func__, i, ne, ggml_nbytes(tensors[i]),
-                    (size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
+                    __func__, i, ne, ggml_nbytes(tensors[i]),
+                    (size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
             return GGML_CUDA_COMM_ALLREDUCE_UNSUPPORTED;
         }
         if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
-            GGML_LOG_WARN("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
-                    __func__, i, tensors[i]->data, (int) type, ne);
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
+                    __func__, i, tensors[i]->data, (int) type, ne);
             return GGML_CUDA_COMM_ALLREDUCE_UNSUPPORTED;
         }
     }
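
The ggml-cuda.cu hunks above make the comm dispatcher the single place where unsupported cases are rejected (now logged at debug rather than warn verbosity), while ggml_cuda_ar_allreduce itself only asserts. A self-contained sketch of that dispatcher-validates / internal-path-asserts contract, using stand-in types rather than the real ggml structs:

    // Sketch only: stand-in types; the real checks live in
    // ggml_backend_cuda_comm_try_allreduce_internal shown in the diff above.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    struct tensor_view { const void * data; int64_t ne; bool contiguous; };

    // Internal path: preconditions are the caller's responsibility
    // (mirrors the GGML_ASSERT-based ggml_cuda_ar_allreduce).
    static bool internal_allreduce(const tensor_view & t) {
        assert(t.ne > 0 && t.contiguous);
        assert(((uintptr_t) t.data & 0xF) == 0); // 16-byte alignment required
        // ... enqueue chunked kernels here ...
        return true;
    }

    // Dispatcher: checks the same conditions up front and reports "unsupported"
    // so the caller can fall back to another provider (NCCL / CPU reduce).
    static bool try_internal_allreduce(const tensor_view & t) {
        if (t.ne <= 0 || !t.contiguous || ((uintptr_t) t.data & 0xF) != 0) {
            return false; // fall back
        }
        return internal_allreduce(t);
    }

    int main() {
        alignas(16) static float buf[64] = {};
        const tensor_view t = { buf, 64, true };
        printf("internal path used: %s\n", try_internal_allreduce(t) ? "yes" : "no");
        return 0;
    }
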