rework reduction provider selection: the internal/NCCL choice is OS-dependent; most fallbacks are removed

Scott Cutler
2026-05-05 17:59:20 -07:00
parent 5d1b22ab3d
commit 727b141c02


@@ -86,6 +86,9 @@
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define GGML_LOG_WARN_ONCE(str) \
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
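The static once_flag lives at the macro's expansion site, so each call site of GGML_LOG_WARN_ONCE warns at most once per process, independently of other call sites. A minimal standalone sketch of the same idiom (LOG_WARN here is a stand-in for the real logger, not ggml code):

#include <cstdio>
#include <mutex>

// Stand-in for GGML_LOG_WARN, just for this sketch.
#define LOG_WARN(str) fprintf(stderr, "%s", str)

// Same shape as GGML_LOG_WARN_ONCE: the static once_flag lives at the
// expansion site, so each call site warns at most once per process.
#define LOG_WARN_ONCE(str) \
    { static std::once_flag warn_flag; std::call_once(warn_flag, []() { LOG_WARN(str); }); }

int main() {
    for (int i = 0; i < 3; i++) {
        LOG_WARN_ONCE("printed once despite three iterations\n");
    }
    return 0;
}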
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
@@ -1151,19 +1154,16 @@ struct ggml_backend_cuda_comm_context {
std::vector<ggml_backend_t> backends;
std::vector<int> dev_ids;
// Internal AR pipeline. Allocated lazily on first try_allreduce_internal
// call so a config that lives entirely on NCCL never spends VRAM / pinned
// host memory on it. Guarded by ar_pipeline_init_flag (std::call_once).
std::once_flag ar_pipeline_init_flag;
// Provider chosen at init time from GGML_CUDA_ALLREDUCE; called directly
// by the dispatch. One of try_allreduce_{internal,nccl,none,os}.
// Set in comm_init to one of try_allreduce_{nccl, internal_strict,
// internal_lenient, butterfly} based on GGML_CUDA_ALLREDUCE and the
// platform. Each variant assumes the resources it needs were
// initialised in comm_init; nccl needs `comms`, both internal variants
// need `ar_pipeline`, butterfly needs nothing.
try_allreduce_fn try_allreduce = nullptr;
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
#ifdef GGML_USE_NCCL
std::once_flag nccl_init_flag;
bool nccl_init_ok = false;
std::vector<ncclComm_t> comms;
#endif
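The try_allreduce_fn typedef itself is outside this hunk; assuming it is a plain function pointer over (context, tensors), the design is a strategy selected once in comm_init so the per-call dispatch does no env parsing or platform branching. A minimal sketch of that shape (names and parameter types illustrative, not the real ggml types):

#include <cstdio>

struct comm_context;  // stand-in for ggml_backend_cuda_comm_context

// Assumed shape of try_allreduce_fn: returns false to request the
// meta-backend butterfly fallback.
typedef bool (*try_allreduce_fn)(comm_context * ctx, void ** tensors);

struct comm_context {
    try_allreduce_fn try_allreduce = nullptr;
};

static bool try_allreduce_butterfly(comm_context *, void **) { return false; }

int main() {
    comm_context ctx;
    ctx.try_allreduce = try_allreduce_butterfly;  // chosen once at init time
    void * tensors[1] = { nullptr };
    // Hot path: one indirect call, no branching on mode.
    printf("CUDA provider handled it: %s\n",
           ctx.try_allreduce(&ctx, tensors) ? "yes" : "no");
    return 0;
}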
@@ -1254,40 +1254,19 @@ static bool ggml_backend_cuda_comm_allreduce_nccl(
}
#endif // GGML_USE_NCCL
// Lazily initialise the internal AR pipeline on first use. Returns true
// when the pipeline is ready; false if init failed (e.g. n_devices != 2 or
// pre-Ampere) — caller falls through to the next provider.
static bool ggml_backend_cuda_comm_ensure_internal(ggml_backend_cuda_comm_context * comm_ctx) {
std::call_once(comm_ctx->ar_pipeline_init_flag, [&] {
comm_ctx->ar_pipeline = ggml_cuda_ar_pipeline_init(
comm_ctx->dev_ids.data(), comm_ctx->dev_ids.size());
if (comm_ctx->ar_pipeline == nullptr) {
// Clear any sticky CUDA error from the failed init so it can't
// leak into a later NCCL call.
(void) cudaGetLastError();
}
});
return comm_ctx->ar_pipeline != nullptr;
}
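The cudaGetLastError() call in the failure path matters because the CUDA runtime keeps the last error in per-thread state; reading it also resets it to cudaSuccess, which is what keeps a failed pipeline init from surfacing in a later, unrelated call. A small demonstration of that reset-on-read behaviour (using only the documented runtime API):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaSetDevice(9999);  // deliberately invalid device ordinal
    // First read reports the failure AND clears the per-thread error state.
    printf("first : %s\n", cudaGetErrorString(cudaGetLastError()));
    // Second read sees cudaSuccess again: the error did not leak forward.
    printf("second: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}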
// Try the internal AllReduce. Returns true on success. Returns false when
// the pipeline is unavailable or the input is unsupported, so the caller
// falls through to the next provider. Tensor-shape errors are logged but
// still return false (the meta-backend butterfly will catch them).
// Run the internal AR pipeline. Returns false on unsupported / failed input;
// the caller decides whether to abort (env-forced) or fall back silently.
static bool ggml_backend_cuda_comm_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
if (!ggml_backend_cuda_comm_ensure_internal(comm_ctx)) {
return false;
}
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
const size_t n_backends = comm_ctx->backends.size();
GGML_ASSERT(n_backends >= 1);
GGML_ASSERT(n_backends == 2);
GGML_ASSERT(tensors[0] != nullptr);
const int64_t ne = ggml_nelements(tensors[0]);
const ggml_type type = tensors[0]->type;
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
return false;
@@ -1323,53 +1302,50 @@ static bool ggml_backend_cuda_comm_try_allreduce_internal(
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
}
#ifdef GGML_USE_NCCL
// Lazily initialise NCCL communicators on first use.
// Returns true when comms are ready; false if init failed.
static bool ggml_backend_cuda_comm_ensure_nccl(ggml_backend_cuda_comm_context * comm_ctx) {
std::call_once(comm_ctx->nccl_init_flag, [&] {
const size_t n = comm_ctx->dev_ids.size();
comm_ctx->comms.resize(n);
ncclResult_t rc = ncclCommInitAll(comm_ctx->comms.data(), (int) n, comm_ctx->dev_ids.data());
if (rc != ncclSuccess) {
GGML_LOG_ERROR("%s: ncclCommInitAll failed: %s\n", __func__, ncclGetErrorString(rc));
comm_ctx->comms.clear();
return;
}
comm_ctx->nccl_init_ok = true;
});
return comm_ctx->nccl_init_ok;
}
// ---------------------------------------------------------------------------
// try_allreduce variants — one per mode. All assume their required resource
// has already been initialised by comm_init.
// ---------------------------------------------------------------------------
// NCCL-only. Used for env=nccl on any platform AND for the Linux default.
// On NCCL-internal failure, ggml_backend_cuda_comm_allreduce_nccl aborts via
// NCCL_CHECK; we'll only get here on success.
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
    ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
#ifdef GGML_USE_NCCL
    GGML_ASSERT(!comm_ctx->comms.empty());
    return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
#else
    GGML_UNUSED(comm_ctx); GGML_UNUSED(tensors);
    GGML_ABORT("try_allreduce_nccl unreachable: built without NCCL");
#endif
}
// Internal-only (env=internal). Failure aborts so the user knows their
// requested mode is not viable.
static bool ggml_backend_cuda_comm_try_allreduce_internal_strict(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
if (ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors)) {
return true;
}
GGML_ABORT("GGML_CUDA_ALLREDUCE=internal: AR call failed (unsupported input). "
"Reset the environment variable to use the platform default.");
}
// Internal with butterfly fallback. Used for the Windows default — internal
// is preferred but a return-false cleanly hits the meta-backend's butterfly.
static bool ggml_backend_cuda_comm_try_allreduce_internal_lenient(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
if (ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors)) {
return true;
}
GGML_LOG_WARN_ONCE("internal AllReduce call failed; "
"meta-backend butterfly will be used for this and subsequent calls\n");
return false;
}
#endif
// Platform-tuned default order. Each helper returns false if its provider is
// unavailable, so the short-circuit OR naturally falls through to the next.
static bool ggml_backend_cuda_comm_try_allreduce_os(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
#if defined(__linux__)
return ggml_backend_cuda_comm_try_allreduce_nccl(comm_ctx, tensors)
|| ggml_backend_cuda_comm_try_allreduce_internal(comm_ctx, tensors);
#else
return ggml_backend_cuda_comm_try_allreduce_internal(comm_ctx, tensors)
|| ggml_backend_cuda_comm_try_allreduce_nccl(comm_ctx, tensors);
#endif
}
// "GGML_CUDA_ALLREDUCE=none": skip CUDA providers, fall back to butterfly.
static bool ggml_backend_cuda_comm_try_allreduce_none(
// Butterfly-only (env=none, or after a failed init for non-strict modes).
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
return false;
}
@@ -1381,10 +1357,36 @@ static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
}
// Create the comm context. Internal AllReduce is allocated unconditionally
// (warning on failure). GGML_CUDA_ALLREDUCE is read here exactly once and
// used to pick the per-call provider function; runtime fallback to butterfly
// happens naturally if the chosen provider can't serve the call.
// Resource initializers — return true on success.
#ifdef GGML_USE_NCCL
static bool ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ctx) {
const size_t n = ctx->dev_ids.size();
ctx->comms.resize(n);
ncclResult_t rc = ncclCommInitAll(ctx->comms.data(), (int) n, ctx->dev_ids.data());
if (rc != ncclSuccess) {
ctx->comms.clear();
GGML_LOG_ERROR("%s: ncclCommInitAll failed: %s\n", __func__, ncclGetErrorString(rc));
return false;
}
return true;
}
#endif
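For context, the single-process communicators created by ggml_backend_cuda_comm_init_nccl are consumed by grouped collective calls elsewhere. A minimal self-contained sketch of that NCCL pattern (two devices assumed, buffer sizes illustrative):

#include <nccl.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int dev_ids[2] = {0, 1};
    ncclComm_t comms[2];
    // Same call as ggml_backend_cuda_comm_init_nccl: one communicator per
    // device, all owned by this single process.
    if (ncclCommInitAll(comms, 2, dev_ids) != ncclSuccess) {
        fprintf(stderr, "ncclCommInitAll failed\n");
        return 1;
    }
    float * buf[2];
    cudaStream_t streams[2];
    for (int i = 0; i < 2; i++) {
        cudaSetDevice(dev_ids[i]);
        cudaMalloc(&buf[i], 4 * sizeof(float));
        cudaMemset(buf[i], 0, 4 * sizeof(float));
        cudaStreamCreate(&streams[i]);
    }
    // Single-process multi-GPU collectives must be issued inside a group,
    // otherwise the per-device calls can deadlock waiting on each other.
    ncclGroupStart();
    for (int i = 0; i < 2; i++) {
        ncclAllReduce(buf[i], buf[i], 4, ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
    for (int i = 0; i < 2; i++) {
        cudaSetDevice(dev_ids[i]);
        cudaStreamSynchronize(streams[i]);
        ncclCommDestroy(comms[i]);
    }
    return 0;
}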
static bool ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ctx) {
ctx->ar_pipeline = ggml_cuda_ar_pipeline_init(ctx->dev_ids.data(), ctx->dev_ids.size());
if (ctx->ar_pipeline == nullptr) {
// Clear sticky CUDA error from the failed init.
(void) cudaGetLastError();
return false;
}
return true;
}
// Pick the try_allreduce function pointer based on GGML_CUDA_ALLREDUCE / OS,
// then init the resource that pointer needs (NCCL or internal pipeline).
// Env-forced (strict) modes abort on init failure; the lenient Windows
// default and the NCCL-less Linux default degrade to butterfly instead.
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
@@ -1399,35 +1401,62 @@ static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_ba
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
}
// The internal AR pipeline is created on first try_allreduce_internal
// call (see ensure_internal) so a config that lives entirely on NCCL
// never spends resources on it.
// Eager init and warning on targets where NCCL is the preferred provider
#if defined(GGML_USE_NCCL) && !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32)
if (!ggml_backend_cuda_comm_ensure_nccl(ret))
{
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, "
"multi GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
}
// 1. Pick the function pointer.
const char * env = getenv("GGML_CUDA_ALLREDUCE");
const bool env_nccl = env && strcmp(env, "nccl") == 0;
const bool env_internal = env && strcmp(env, "internal") == 0;
const bool env_none = env && strcmp(env, "none") == 0;
if (env_nccl) ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
else if (env_internal) ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal_strict;
else if (env_none) ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
#if defined(_WIN32)
else ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal_lenient;
#elif defined(__linux__)
else ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
#else
else GGML_ABORT("no AllReduce default for this platform; set GGML_CUDA_ALLREDUCE explicitly");
#endif
const char * env = getenv("GGML_CUDA_ALLREDUCE");
if (env && strcmp(env, "internal") == 0) ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
else if (env && strcmp(env, "nccl") == 0) ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
else if (env && strcmp(env, "none") == 0) ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_none;
else ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_os;
// 2. Init the matching resource. Strict modes (env-forced or Linux
// default = NCCL) abort on failure. The Windows-default lenient
// internal mode degrades to butterfly on init failure. Linux
// without NCCL compiled in degrades to butterfly with a warning.
if (ret->try_allreduce == ggml_backend_cuda_comm_try_allreduce_nccl) {
#ifdef GGML_USE_NCCL
if (!ggml_backend_cuda_comm_init_nccl(ret)) {
GGML_ABORT("NCCL init failed. Set GGML_CUDA_ALLREDUCE=internal or =none to bypass.");
}
#else
if (env_nccl) {
GGML_ABORT("GGML_CUDA_ALLREDUCE=nccl requested but llama.cpp was not built with NCCL. "
"Recompile with -DGGML_CUDA_NCCL=ON or reset the environment variable.");
}
// Linux default with no NCCL compiled: warn and degrade to butterfly.
GGML_LOG_WARN_ONCE("NVIDIA Collective Communications Library (NCCL) is unavailable; "
"multi-GPU performance will be suboptimal. "
"Recompile with -DGGML_CUDA_NCCL=ON for best performance.");
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
#endif
} else if (ret->try_allreduce == ggml_backend_cuda_comm_try_allreduce_internal_strict) {
if (!ggml_backend_cuda_comm_init_internal(ret)) {
GGML_ABORT("internal AllReduce pipeline init failed (n_devices != 2 or pre-Ampere?). "
"Reset GGML_CUDA_ALLREDUCE to use the platform default.");
}
} else if (ret->try_allreduce == ggml_backend_cuda_comm_try_allreduce_internal_lenient) {
if (!ggml_backend_cuda_comm_init_internal(ret)) {
GGML_LOG_WARN_ONCE("internal AllReduce pipeline init failed (n_devices != 2 or pre-Ampere?); "
"meta-backend butterfly will be used");
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
}
}
// else: butterfly, no init needed.
return ret;
}
// Top-level dispatch. Just calls the function pointer comm_init picked from
// GGML_CUDA_ALLREDUCE. Returns false to fall back to the meta backend's
// butterfly reduction.
// Top-level dispatch calls the function pointer chosen by comm_init.
// Returns false to let the meta-backend's butterfly run.
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
if (comm_ctx_v == nullptr) {
return false;