Use dlopen to load NCCL. (#9796)

This PR adds optional support for loading nccl with `dlopen` as an alternative of compile time linking. This is to address the size bloat issue with the PyPI binary release. - Add CMake option to load `nccl` at runtime. - Add an NCCL stub. After this, `nccl` will be fetched from PyPI when using pip to install XGBoost, either by a user or by `pyproject.toml`. Others who want to link the nccl at compile time can continue to do so without any change. At the moment, this is Linux only since we only support MNMG on Linux.
2023-11-22 19:27:31 +08:00
parent fedd9674c8
commit 0715ab3c10
45 changed files with 658 additions and 268 deletions
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -7,8 +7,6 @@
 #include <cinttypes>                         // for strtoimax
 #include <cmath>                             // for nan
 #include <cstring>                           // for strcmp
-#include <fstream>                           // for operator<<, basic_ostream, ios, stringstream
-#include <functional>                        // for less
 #include <limits>                            // for numeric_limits
 #include <map>                               // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
 #include <memory>                            // for shared_ptr, allocator, __shared_ptr_access
@@ -22,7 +20,6 @@
 #include "../common/charconv.h"              // for from_chars, to_chars, NumericLimits, from_ch...
 #include "../common/hist_util.h"             // for HistogramCuts
 #include "../common/io.h"                    // for FileExtension, LoadSequentialFile, MemoryBuf...
-#include "../common/linalg_op.h"             // for ElementWiseTransformHost
 #include "../common/threading_utils.h"       // for OmpGetNumThreads, ParallelFor
 #include "../data/adapter.h"                 // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
 #include "../data/ellpack_page.h"            // for EllpackPage
@@ -35,14 +32,12 @@
 #include "dmlc/parameter.h"                  // for FieldAccessEntry, FieldEntry, ParamManager
 #include "dmlc/thread_local.h"               // for ThreadLocalStore
 #include "rabit/c_api.h"                     // for RabitLinkTag
-#include "rabit/rabit.h"                     // for CheckPoint, LoadCheckPoint
 #include "xgboost/base.h"                    // for bst_ulong, bst_float, GradientPair, bst_feat...
 #include "xgboost/context.h"                 // for Context
 #include "xgboost/data.h"                    // for DMatrix, MetaInfo, DataType, ExtSparsePage
 #include "xgboost/feature_map.h"             // for FeatureMap
 #include "xgboost/global_config.h"           // for GlobalConfiguration, GlobalConfigThreadLocal...
 #include "xgboost/host_device_vector.h"      // for HostDeviceVector
-#include "xgboost/intrusive_ptr.h"           // for xgboost
 #include "xgboost/json.h"                    // for Json, get, Integer, IsA, Boolean, String
 #include "xgboost/learner.h"                 // for Learner, PredictionType
 #include "xgboost/logging.h"                 // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
@@ -79,6 +74,7 @@ void XGBBuildInfoDevice(Json *p_info) {
  info["USE_CUDA"] = Boolean{false};
  info["USE_NCCL"] = Boolean{false};
  info["USE_RMM"] = Boolean{false};
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
 }
 }  // namespace xgboost
 #endif
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -33,8 +33,16 @@ void XGBBuildInfoDevice(Json *p_info) {
  info["USE_NCCL"] = Boolean{true};
  v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
  info["NCCL_VERSION"] = v;
+
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  info["USE_DLOPEN_NCCL"] = Boolean{true};
+#else
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
+#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
+
 #else
  info["USE_NCCL"] = Boolean{false};
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
 #endif

 #if defined(XGBOOST_USE_RMM)
--- a/src/collective/coll.cu
+++ b/src/collective/coll.cu
@@ -19,25 +19,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; }

 NCCLColl::~NCCLColl() = default;
 namespace {
-Result GetNCCLResult(ncclResult_t code) {
-  if (code == ncclSuccess) {
-    return Success();
-  }
-
-  std::stringstream ss;
-  ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
-  if (code == ncclUnhandledCudaError) {
-    // nccl usually preserves the last error so we can get more details.
-    auto err = cudaPeekAtLastError();
-    ss << "  CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
-  } else if (code == ncclSystemError) {
-    ss << "  This might be caused by a network configuration issue. Please consider specifying "
-          "the network interface for NCCL via environment variables listed in its reference: "
-          "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
-  }
-  return Fail(ss.str());
-}
-
 auto GetNCCLType(ArrayInterfaceHandler::Type type) {
  auto fatal = [] {
    LOG(FATAL) << "Invalid type for NCCL operation.";
@@ -94,11 +75,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> ou
                                      common::Span<std::int8_t> data, Op op) {
  dh::device_vector<std::int8_t> buffer(data.size() * pcomm->World());
  auto* device_buffer = buffer.data().get();
+  auto stub = pcomm->Stub();

  // First gather data from all the workers.
  CHECK(handle);
-  auto rc = GetNCCLResult(
-      ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream()));
+  auto rc = GetNCCLResult(stub, stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8,
+                                                handle, pcomm->Stream()));
  if (!rc.OK()) {
    return rc;
  }
@@ -149,6 +131,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
  }
  auto nccl = dynamic_cast<NCCLComm const*>(&comm);
  CHECK(nccl);
+  auto stub = nccl->Stub();
+
  return Success() << [&] {
    if (IsBitwiseOp(op)) {
      return BitwiseAllReduce(nccl, nccl->Handle(), data, op);
@@ -156,9 +140,9 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
      return DispatchDType(type, [=](auto t) {
        using T = decltype(t);
        auto rdata = common::RestoreType<T>(data);
-        auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
-                                GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
-        return GetNCCLResult(rc);
+        auto rc = stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
+                                  GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
+        return GetNCCLResult(stub, rc);
      });
    }
  } << [&] { return nccl->Block(); };
@@ -171,9 +155,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
  }
  auto nccl = dynamic_cast<NCCLComm const*>(&comm);
  CHECK(nccl);
+  auto stub = nccl->Stub();
+
  return Success() << [&] {
-    return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
-                                       nccl->Handle(), nccl->Stream()));
+    return GetNCCLResult(stub, stub->Broadcast(data.data(), data.data(), data.size_bytes(),
+                                               ncclInt8, root, nccl->Handle(), nccl->Stream()));
  } << [&] { return nccl->Block(); };
 }

@@ -184,10 +170,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
  }
  auto nccl = dynamic_cast<NCCLComm const*>(&comm);
  CHECK(nccl);
+  auto stub = nccl->Stub();
+
  auto send = data.subspan(comm.Rank() * size, size);
  return Success() << [&] {
-    return GetNCCLResult(
-        ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream()));
+    return GetNCCLResult(stub, stub->Allgather(send.data(), data.data(), size, ncclInt8,
+                                               nccl->Handle(), nccl->Stream()));
  } << [&] { return nccl->Block(); };
 }

@@ -199,19 +187,20 @@ namespace cuda_impl {
 */
 Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const> data,
                           common::Span<std::int64_t const> sizes, common::Span<std::int8_t> recv) {
-  return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
+  auto stub = comm->Stub();
+  return Success() << [&stub] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] {
    std::size_t offset = 0;
    for (std::int32_t r = 0; r < comm->World(); ++r) {
      auto as_bytes = sizes[r];
-      auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
-                              ncclInt8, r, comm->Handle(), dh::DefaultStream());
+      auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
+                                ncclInt8, r, comm->Handle(), dh::DefaultStream());
      if (rc != ncclSuccess) {
-        return GetNCCLResult(rc);
+        return GetNCCLResult(stub, rc);
      }
      offset += as_bytes;
    }
    return Success();
-  } << [] { return GetNCCLResult(ncclGroupEnd()); };
+  } << [&] { return GetNCCLResult(stub, stub->GroupEnd()); };
 }
 }  // namespace cuda_impl

@@ -224,10 +213,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
  if (!comm.IsDistributed()) {
    return Success();
  }
+  auto stub = nccl->Stub();

  switch (algo) {
    case AllgatherVAlgo::kRing: {
-      return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
+      return Success() << [&] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] {
        // get worker offset
        detail::AllgatherVOffset(sizes, recv_segments);
        // copy data
@@ -237,8 +227,8 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
                                        cudaMemcpyDeviceToDevice, nccl->Stream()));
        }
        return detail::RingAllgatherV(comm, sizes, recv_segments, recv);
-      } << [] {
-        return GetNCCLResult(ncclGroupEnd());
+      } << [&] {
+        return GetNCCLResult(stub, stub->GroupEnd());
      } << [&] { return nccl->Block(); };
    }
    case AllgatherVAlgo::kBcast: {
--- a/src/collective/coll.cuh
+++ b/src/collective/coll.cuh
@@ -8,7 +8,8 @@
 #include "../data/array_interface.h"  // for ArrayInterfaceHandler
 #include "coll.h"                     // for Coll
 #include "comm.h"                     // for Comm
-#include "xgboost/span.h"             // for Span
+#include "nccl_stub.h"
+#include "xgboost/span.h"  // for Span

 namespace xgboost::collective {
 class NCCLColl : public Coll {
--- a/src/collective/comm.cc
+++ b/src/collective/comm.cc
@@ -7,15 +7,12 @@
 #include <chrono>     // for seconds
 #include <cstdlib>    // for exit
 #include <memory>     // for shared_ptr
-#include <mutex>      // for unique_lock
 #include <string>     // for string
 #include <utility>    // for move, forward

 #include "../common/common.h"           // for AssertGPUSupport
-#include "../common/json_utils.h"       // for OptionalArg
 #include "allgather.h"                  // for RingAllgather
 #include "protocol.h"                   // for kMagic
-#include "tracker.h"                    // for GetHostAddress
 #include "xgboost/base.h"               // for XGBOOST_STRICT_R_MODE
 #include "xgboost/collective/socket.h"  // for TCPSocket
 #include "xgboost/json.h"               // for Json, Object
@@ -62,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st
                            this->Rank(), this->World());
 }

-#if !defined(XGBOOST_USE_NCCL)
-Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
-  common::AssertGPUSupport();
-  common::AssertNCCLSupport();
-  return nullptr;
-}
-#endif  //  !defined(XGBOOST_USE_NCCL)
-
 [[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
                                    proto::PeerInfo ninfo, std::chrono::seconds timeout,
                                    std::int32_t retry,
@@ -194,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
 }

 RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
-                     std::int32_t retry, std::string task_id)
-    : Comm{std::move(host), port, timeout, retry, std::move(task_id)} {
+                     std::int32_t retry, std::string task_id, StringView nccl_path)
+    : HostComm{std::move(host), port, timeout, retry, std::move(task_id)},
+      nccl_path_{std::move(nccl_path)} {
  auto rc = this->Bootstrap(timeout_, retry_, task_id_);
  CHECK(rc.OK()) << rc.Report();
 }

+#if !defined(XGBOOST_USE_NCCL)
+Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
+  common::AssertGPUSupport();
+  common::AssertNCCLSupport();
+  return nullptr;
+}
+#endif  //  !defined(XGBOOST_USE_NCCL)
+
 [[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                          std::string task_id) {
  TCPSocket tracker;
--- a/src/collective/comm.cu
+++ b/src/collective/comm.cu
@@ -13,19 +13,21 @@
 #include "../common/cuda_context.cuh"    // for CUDAContext
 #include "../common/device_helpers.cuh"  // for DefaultStream
 #include "../common/type.h"              // for EraseType
-#include "broadcast.h"                   // for Broadcast
 #include "comm.cuh"                      // for NCCLComm
 #include "comm.h"                        // for Comm
+#include "nccl_stub.h"                   // for NcclStub
 #include "xgboost/collective/result.h"   // for Result
 #include "xgboost/span.h"                // for Span

 namespace xgboost::collective {
 namespace {
-Result GetUniqueId(Comm const& comm, std::shared_ptr<Coll> coll, ncclUniqueId* pid) {
+Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared_ptr<Coll> coll,
+                   ncclUniqueId* pid) {
  static const int kRootRank = 0;
  ncclUniqueId id;
  if (comm.Rank() == kRootRank) {
-    dh::safe_nccl(ncclGetUniqueId(&id));
+    auto rc = GetNCCLResult(stub, stub->GetUniqueId(&id));
+    CHECK(rc.OK()) << rc.Report();
  }
  auto rc = coll->Broadcast(
      comm, common::Span{reinterpret_cast<std::int8_t*>(&id), sizeof(ncclUniqueId)}, kRootRank);
@@ -54,11 +56,12 @@ static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> c
 }
 }  // namespace

-Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
-  return new NCCLComm{ctx, *this, pimpl};
+Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
+  return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}};
 }

-NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl)
+NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
+                   StringView nccl_path)
    : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(),
           root.TaskID()},
      stream_{ctx->CUDACtx()->Stream()} {
@@ -70,6 +73,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
  }

  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
+  stub_ = std::make_shared<NcclStub>(nccl_path);

  std::vector<std::uint64_t> uuids(root.World() * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<std::uint64_t>{uuids.data(), uuids.size()};
@@ -95,19 +99,24 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
      << "Multiple processes within communication group running on same CUDA "
      << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";

-  rc = GetUniqueId(root, pimpl, &nccl_unique_id_);
+  rc = std::move(rc) << [&] {
+    return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_);
+  } << [&] {
+    return GetNCCLResult(this->stub_, this->stub_->CommInitRank(&nccl_comm_, root.World(),
+                                                                nccl_unique_id_, root.Rank()));
+  };
  CHECK(rc.OK()) << rc.Report();
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()));

  for (std::int32_t r = 0; r < root.World(); ++r) {
    this->channels_.emplace_back(
-        std::make_shared<NCCLChannel>(root, r, nccl_comm_, dh::DefaultStream()));
+        std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, dh::DefaultStream()));
  }
 }

 NCCLComm::~NCCLComm() {
  if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_));
+    CHECK(rc.OK()) << rc.Report();
  }
 }
 }  // namespace xgboost::collective
--- a/src/collective/comm.cuh
+++ b/src/collective/comm.cuh
@@ -6,9 +6,13 @@
 #ifdef XGBOOST_USE_NCCL
 #include "nccl.h"
 #endif  // XGBOOST_USE_NCCL
+
+#include <utility>  // for move
+
 #include "../common/device_helpers.cuh"
 #include "coll.h"
 #include "comm.h"
+#include "nccl_stub.h"  // for NcclStub
 #include "xgboost/context.h"

 namespace xgboost::collective {
@@ -21,15 +25,20 @@ inline Result GetCUDAResult(cudaError rc) {
  return Fail(msg);
 }

+#if defined(XGBOOST_USE_NCCL)
 class NCCLComm : public Comm {
  ncclComm_t nccl_comm_{nullptr};
+  std::shared_ptr<NcclStub> stub_;
  ncclUniqueId nccl_unique_id_{};
  dh::CUDAStreamView stream_;
+  std::string nccl_path_;

 public:
  [[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; }
+  auto Stub() const { return stub_; }

-  explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl);
+  explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
+                    StringView nccl_path);
  [[nodiscard]] Result LogTracker(std::string) const override {
    LOG(FATAL) << "Device comm is used for logging.";
    return Fail("Undefined.");
@@ -43,25 +52,53 @@ class NCCLComm : public Comm {
  }
 };

+inline Result GetNCCLResult(std::shared_ptr<NcclStub> stub, ncclResult_t code) {
+  if (code == ncclSuccess) {
+    return Success();
+  }
+
+  std::stringstream ss;
+  ss << "NCCL failure: " << stub->GetErrorString(code) << ".";
+  if (code == ncclUnhandledCudaError) {
+    // nccl usually preserves the last error so we can get more details.
+    auto err = cudaPeekAtLastError();
+    ss << "  CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+  } else if (code == ncclSystemError) {
+    ss << "  This might be caused by a network configuration issue. Please consider specifying "
+          "the network interface for NCCL via environment variables listed in its reference: "
+          "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
+  }
+  return Fail(ss.str());
+}
+
 class NCCLChannel : public Channel {
  std::int32_t rank_{-1};
  ncclComm_t nccl_comm_{};
+  std::shared_ptr<NcclStub> stub_;
  dh::CUDAStreamView stream_;

 public:
  explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm,
-                       dh::CUDAStreamView stream)
-      : rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {}
+                       std::shared_ptr<NcclStub> stub, dh::CUDAStreamView stream)
+      : rank_{rank},
+        nccl_comm_{nccl_comm},
+        stub_{std::move(stub)},
+        Channel{comm, nullptr},
+        stream_{stream} {}

  void SendAll(std::int8_t const* ptr, std::size_t n) override {
-    dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    auto rc = GetNCCLResult(stub_, stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    CHECK(rc.OK()) << rc.Report();
  }
  void RecvAll(std::int8_t* ptr, std::size_t n) override {
-    dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    auto rc = GetNCCLResult(stub_, stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    CHECK(rc.OK()) << rc.Report();
  }
  [[nodiscard]] Result Block() override {
    auto rc = stream_.Sync(false);
    return GetCUDAResult(rc);
  }
 };
+
+#endif  //  defined(XGBOOST_USE_NCCL)
 }  // namespace xgboost::collective
--- a/src/collective/comm.h
+++ b/src/collective/comm.h
@@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
  return nrank;
 }

+inline StringView DefaultNcclName() { return "libnccl.so.2"; }
+
 class Channel;
 class Coll;

@@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this<Comm> {
  [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;

  [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
-
-  virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const;
 };

-class RabitComm : public Comm {
+/**
+ * @brief Base class for CPU-based communicator.
+ */
+class HostComm : public Comm {
+ public:
+  using Comm::Comm;
+  [[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx,
+                                          std::shared_ptr<Coll> pimpl) const = 0;
+};
+
+class RabitComm : public HostComm {
+  std::string nccl_path_ = std::string{DefaultNcclName()};
+
  [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                 std::string task_id);
  [[nodiscard]] Result Shutdown();
@@ -100,13 +112,15 @@ class RabitComm : public Comm {
  RabitComm() = default;
  // ctor for testing where environment is known.
  RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
-            std::int32_t retry, std::string task_id);
+            std::int32_t retry, std::string task_id, StringView nccl_path);
  ~RabitComm() noexcept(false) override;

  [[nodiscard]] bool IsFederated() const override { return false; }
  [[nodiscard]] Result LogTracker(std::string msg) const override;

  [[nodiscard]] Result SignalError(Result const&) override;
+
+  [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
 };

 /**
--- a/src/collective/comm_group.cc
+++ b/src/collective/comm_group.cc
@@ -37,7 +37,7 @@ namespace xgboost::collective {
 [[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const {
  if (device.IsCUDA()) {
    CHECK(ctx->IsCUDA());
-    if (!gpu_comm_) {
+    if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) {
      gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_));
    }
    return *gpu_comm_;
@@ -55,7 +55,6 @@ CommGroup::CommGroup()
  }

  std::string type = OptionalArg<String>(config, "dmlc_communicator", std::string{"rabit"});
-  std::vector<std::string> keys;
  // Try both lower and upper case for compatibility
  auto get_param = [&](std::string name, auto dft, auto t) {
    std::string upper;
@@ -63,8 +62,6 @@ CommGroup::CommGroup()
                   [](char c) { return std::toupper(c); });
    std::transform(name.cbegin(), name.cend(), name.begin(),
                   [](char c) { return std::tolower(c); });
-    keys.push_back(upper);
-    keys.push_back(name);

    auto const& obj = get<Object const>(config);
    auto it = obj.find(upper);
@@ -75,19 +72,19 @@ CommGroup::CommGroup()
    }
  };
  // Common args
-  auto retry =
-      OptionalArg<Integer>(config, "dmlc_retry", static_cast<Integer::Int>(DefaultRetry()));
-  auto timeout = OptionalArg<Integer>(config, "dmlc_timeout_sec",
-                                      static_cast<Integer::Int>(DefaultTimeoutSec()));
+  auto retry = get_param("dmlc_retry", static_cast<Integer::Int>(DefaultRetry()), Integer{});
+  auto timeout =
+      get_param("dmlc_timeout_sec", static_cast<Integer::Int>(DefaultTimeoutSec()), Integer{});
  auto task_id = get_param("dmlc_task_id", std::string{}, String{});

  if (type == "rabit") {
    auto host = get_param("dmlc_tracker_uri", std::string{}, String{});
    auto port = get_param("dmlc_tracker_port", static_cast<std::int64_t>(0), Integer{});
+    auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{});
    auto ptr =
        new CommGroup{std::shared_ptr<RabitComm>{new RabitComm{  // NOLINT
                          host, static_cast<std::int32_t>(port), std::chrono::seconds{timeout},
-                          static_cast<std::int32_t>(retry), task_id}},
+                          static_cast<std::int32_t>(retry), task_id, nccl}},
                      std::shared_ptr<Coll>(new Coll{})};  // NOLINT
    return ptr;
  } else if (type == "federated") {
--- a/src/collective/comm_group.h
+++ b/src/collective/comm_group.h
@@ -17,14 +17,16 @@ namespace xgboost::collective {
 *        collective implementations.
 */
 class CommGroup {
-  std::shared_ptr<Comm> comm_;
+  std::shared_ptr<HostComm> comm_;
  mutable std::shared_ptr<Comm> gpu_comm_;

  std::shared_ptr<Coll> backend_;
  mutable std::shared_ptr<Coll> gpu_coll_;  // lazy initialization

  CommGroup(std::shared_ptr<Comm> comm, std::shared_ptr<Coll> coll)
-      : comm_{std::move(comm)}, backend_{std::move(coll)} {}
+      : comm_{std::dynamic_pointer_cast<HostComm>(comm)}, backend_{std::move(coll)} {
+    CHECK(comm_);
+  }

 public:
  CommGroup();
--- a/src/collective/communicator.cc
+++ b/src/collective/communicator.cc
@@ -3,6 +3,7 @@
 */
 #include "communicator.h"

+#include "comm.h"
 #include "in_memory_communicator.h"
 #include "noop_communicator.h"
 #include "rabit_communicator.h"
@@ -14,8 +15,12 @@
 namespace xgboost::collective {
 thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
 thread_local CommunicatorType Communicator::type_{};
+thread_local std::string Communicator::nccl_path_{};

 void Communicator::Init(Json const& config) {
+  auto nccl = OptionalArg<String>(config, "dmlc_nccl_path", std::string{DefaultNcclName()});
+  nccl_path_ = nccl;
+
  auto type = GetTypeFromEnv();
  auto const arg = GetTypeFromConfig(config);
  if (arg != CommunicatorType::kUnknown) {
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
 #ifdef XGBOOST_USE_NCCL
    switch (type_) {
      case CommunicatorType::kRabit:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
        break;
      case CommunicatorType::kFederated:
      case CommunicatorType::kInMemory:
        device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
        break;
      case CommunicatorType::kInMemoryNccl:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_));
        break;
      default:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
    }
 #else
    device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@@ -234,6 +234,7 @@ class Communicator {

  static thread_local std::unique_ptr<Communicator> communicator_;
  static thread_local CommunicatorType type_;
+  static thread_local std::string nccl_path_;
 #if defined(XGBOOST_USE_CUDA)
  static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif
--- a/src/collective/nccl_device_communicator.cu
+++ b/src/collective/nccl_device_communicator.cu
@@ -2,12 +2,14 @@
 * Copyright 2023 XGBoost contributors
 */
 #if defined(XGBOOST_USE_NCCL)
+#include "comm.cuh"
 #include "nccl_device_communicator.cuh"

 namespace xgboost {
 namespace collective {

-NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
+NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync,
+                                               StringView nccl_path)
    : device_ordinal_{device_ordinal},
      needs_sync_{needs_sync},
      world_size_{GetWorldSize()},
@@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
  if (world_size_ == 1) {
    return;
  }
+  stub_ = std::make_shared<NcclStub>(std::move(nccl_path));

  std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
@@ -43,7 +46,9 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy

  nccl_unique_id_ = GetUniqueId();
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  auto rc =
+      GetNCCLResult(stub_, stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  CHECK(rc.OK()) << rc.Report();
 }

 NcclDeviceCommunicator::~NcclDeviceCommunicator() {
@@ -51,7 +56,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() {
    return;
  }
  if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_));
+    CHECK(rc.OK()) << rc.Report();
  }
  if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
    LOG(CONSOLE) << "======== NCCL Statistics========";
@@ -137,8 +143,10 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si
  auto *device_buffer = buffer.data().get();

  // First gather data from all the workers.
-  dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
+  auto rc = GetNCCLResult(
+      stub_, stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
                              nccl_comm_, dh::DefaultStream()));
+  CHECK(rc.OK()) << rc.Report();
  if (needs_sync_) {
    dh::DefaultStream().Sync();
  }
@@ -170,9 +178,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
  if (IsBitwiseOp(op)) {
    BitwiseAllReduce(send_receive_buffer, count, data_type, op);
  } else {
-    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
-                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
-                                dh::DefaultStream()));
+    auto rc = GetNCCLResult(stub_, stub_->Allreduce(send_receive_buffer, send_receive_buffer, count,
+                                                    GetNcclDataType(data_type), GetNcclRedOp(op),
+                                                    nccl_comm_, dh::DefaultStream()));
+    CHECK(rc.OK()) << rc.Report();
  }
  allreduce_bytes_ += count * GetTypeSize(data_type);
  allreduce_calls_ += 1;
@@ -185,8 +194,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu
  }

  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
-                              dh::DefaultStream()));
+  auto rc = GetNCCLResult(stub_, stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8,
+                                                  nccl_comm_, dh::DefaultStream()));
+  CHECK(rc.OK()) << rc.Report();
 }

 void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
@@ -206,14 +216,19 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
  receive_buffer->resize(total_bytes);

  size_t offset = 0;
-  dh::safe_nccl(ncclGroupStart());
-  for (int32_t i = 0; i < world_size_; ++i) {
-    size_t as_bytes = segments->at(i);
-    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
-                                ncclChar, i, nccl_comm_, dh::DefaultStream()));
-    offset += as_bytes;
-  }
-  dh::safe_nccl(ncclGroupEnd());
+  auto rc = Success() << [&] { return GetNCCLResult(stub_, stub_->GroupStart()); } << [&] {
+    for (int32_t i = 0; i < world_size_; ++i) {
+      size_t as_bytes = segments->at(i);
+      auto rc = GetNCCLResult(
+          stub_, stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
+                                  ncclChar, i, nccl_comm_, dh::DefaultStream()));
+      if (!rc.OK()) {
+        return rc;
+      }
+      offset += as_bytes;
+    }
+    return Success();
+  } << [&] { return GetNCCLResult(stub_, stub_->GroupEnd()); };
 }

 void NcclDeviceCommunicator::Synchronize() {
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -4,8 +4,10 @@
 #pragma once

 #include "../common/device_helpers.cuh"
+#include "comm.cuh"
 #include "communicator.h"
 #include "device_communicator.cuh"
+#include "nccl_stub.h"

 namespace xgboost {
 namespace collective {
@@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   * needed. The in-memory communicator is used in tests with multiple threads, each thread
   * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
   */
-  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
+  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path);
  ~NcclDeviceCommunicator() override;
  void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                 Operation op) override;
@@ -64,7 +66,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
    static const int kRootRank = 0;
    ncclUniqueId id;
    if (rank_ == kRootRank) {
-      dh::safe_nccl(ncclGetUniqueId(&id));
+      auto rc = GetNCCLResult(stub_, stub_->GetUniqueId(&id));
+      CHECK(rc.OK()) << rc.Report();
    }
    Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
    return id;
@@ -78,6 +81,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
  int const world_size_;
  int const rank_;
  ncclComm_t nccl_comm_{};
+  std::shared_ptr<NcclStub> stub_;
  ncclUniqueId nccl_unique_id_{};
  size_t allreduce_bytes_{0};  // Keep statistics of the number of bytes communicated.
  size_t allreduce_calls_{0};  // Keep statistics of the number of reduce calls.
--- a/src/collective/nccl_stub.cc
+++ b/src/collective/nccl_stub.cc
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#if defined(XGBOOST_USE_NCCL)
+#include "nccl_stub.h"
+
+#include <cuda.h>   // for CUDA_VERSION
+#include <dlfcn.h>  // for dlclose, dlsym, dlopen
+#include <nccl.h>
+
+#include <cstdint>  // for int32_t
+#include <sstream>  // for stringstream
+#include <string>   // for string
+#include <utility>  // for move
+
+#include "xgboost/logging.h"
+
+namespace xgboost::collective {
+NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  CHECK(!path_.empty()) << "Empty path for NCCL.";
+
+  auto cu_major = (CUDA_VERSION) / 1000;
+  std::stringstream ss;
+  ss << R"m(
+
+If XGBoost is installed from PyPI with pip, the error can fixed by:
+
+- Run `pip install nvidia-nccl-cu)m"
+     << cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ").";
+  ss << R"m(
+
+Otherwise, please refer to:
+
+  https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting
+
+for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package
+no long bundles NCCL in the binary wheel.
+
+)m";
+  auto help = ss.str();
+  std::string msg{"Failed to load NCCL from path: `" + path_ + "`. Error:\n  "};
+
+  auto safe_load = [&](auto t, StringView name) {
+    std::stringstream errs;
+    auto ptr = reinterpret_cast<decltype(t)>(dlsym(handle_, name.c_str()));
+    if (!ptr) {
+      errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n  "
+           << dlerror() << help;
+      LOG(FATAL) << errs.str();
+    }
+    return ptr;
+  };
+
+  handle_ = dlopen(path_.c_str(), RTLD_LAZY);
+  if (!handle_) {
+    LOG(FATAL) << msg << dlerror() << help;
+  }
+
+  allreduce_ = safe_load(allreduce_, "ncclAllReduce");
+  broadcast_ = safe_load(broadcast_, "ncclBroadcast");
+  allgather_ = safe_load(allgather_, "ncclAllGather");
+  comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank");
+  comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy");
+  get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId");
+  send_ = safe_load(send_, "ncclSend");
+  recv_ = safe_load(recv_, "ncclRecv");
+  group_start_ = safe_load(group_start_, "ncclGroupStart");
+  group_end_ = safe_load(group_end_, "ncclGroupEnd");
+  get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString");
+  get_version_ = safe_load(get_version_, "ncclGetVersion");
+
+  std::int32_t v;
+  CHECK_EQ(get_version_(&v), ncclSuccess);
+  auto patch = v % 100;
+  auto minor = (v / 100) % 100;
+  auto major = v / 10000;
+
+  LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_
+            << "`" << std::endl;
+#else
+  allreduce_ = ncclAllReduce;
+  broadcast_ = ncclBroadcast;
+  allgather_ = ncclAllGather;
+  comm_init_rank_ = ncclCommInitRank;
+  comm_destroy_ = ncclCommDestroy;
+  get_uniqueid_ = ncclGetUniqueId;
+  send_ = ncclSend;
+  recv_ = ncclRecv;
+  group_start_ = ncclGroupStart;
+  group_end_ = ncclGroupEnd;
+  get_error_string_ = ncclGetErrorString;
+  get_version_ = ncclGetVersion;
+#endif
+};
+
+NcclStub::~NcclStub() {  // NOLINT
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  if (handle_) {
+    auto rc = dlclose(handle_);
+    if (rc != 0) {
+      LOG(WARNING) << "Failed to close NCCL handle:" << dlerror();
+    }
+  }
+  handle_ = nullptr;
+#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
+}
+}  // namespace xgboost::collective
+#endif  // defined(XGBOOST_USE_NCCL)
--- a/src/collective/nccl_stub.h
+++ b/src/collective/nccl_stub.h
@@ -0,0 +1,94 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#if defined(XGBOOST_USE_NCCL)
+#include <cuda_runtime_api.h>
+#include <nccl.h>
+
+#include <string>  // for string
+
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::collective {
+class NcclStub {
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  void* handle_{nullptr};
+#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
+  std::string path_;
+
+  decltype(ncclAllReduce)* allreduce_{nullptr};
+  decltype(ncclBroadcast)* broadcast_{nullptr};
+  decltype(ncclAllGather)* allgather_{nullptr};
+  decltype(ncclCommInitRank)* comm_init_rank_{nullptr};
+  decltype(ncclCommDestroy)* comm_destroy_{nullptr};
+  decltype(ncclGetUniqueId)* get_uniqueid_{nullptr};
+  decltype(ncclSend)* send_{nullptr};
+  decltype(ncclRecv)* recv_{nullptr};
+  decltype(ncclGroupStart)* group_start_{nullptr};
+  decltype(ncclGroupEnd)* group_end_{nullptr};
+  decltype(ncclGetErrorString)* get_error_string_{nullptr};
+  decltype(ncclGetVersion)* get_version_{nullptr};
+
+ public:
+  explicit NcclStub(StringView path);
+  ~NcclStub();
+
+  [[nodiscard]] ncclResult_t Allreduce(const void* sendbuff, void* recvbuff, size_t count,
+                                       ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+                                       cudaStream_t stream) const {
+    CHECK(allreduce_);
+    return this->allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream);
+  }
+  [[nodiscard]] ncclResult_t Broadcast(const void* sendbuff, void* recvbuff, size_t count,
+                                       ncclDataType_t datatype, int root, ncclComm_t comm,
+                                       cudaStream_t stream) const {
+    CHECK(broadcast_);
+    return this->broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream);
+  }
+  [[nodiscard]] ncclResult_t Allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
+                                       ncclDataType_t datatype, ncclComm_t comm,
+                                       cudaStream_t stream) const {
+    CHECK(allgather_);
+    return this->allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream);
+  }
+  [[nodiscard]] ncclResult_t CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
+                                          int rank) const {
+    CHECK(comm_init_rank_);
+    return this->comm_init_rank_(comm, nranks, commId, rank);
+  }
+  [[nodiscard]] ncclResult_t CommDestroy(ncclComm_t comm) const {
+    CHECK(comm_destroy_);
+    return this->comm_destroy_(comm);
+  }
+
+  [[nodiscard]] ncclResult_t GetUniqueId(ncclUniqueId* uniqueId) const {
+    CHECK(get_uniqueid_);
+    return this->get_uniqueid_(uniqueId);
+  }
+  [[nodiscard]] ncclResult_t Send(const void* sendbuff, size_t count, ncclDataType_t datatype,
+                                  int peer, ncclComm_t comm, cudaStream_t stream) {
+    CHECK(send_);
+    return send_(sendbuff, count, datatype, peer, comm, stream);
+  }
+  [[nodiscard]] ncclResult_t Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+                                  ncclComm_t comm, cudaStream_t stream) const {
+    CHECK(recv_);
+    return recv_(recvbuff, count, datatype, peer, comm, stream);
+  }
+  [[nodiscard]] ncclResult_t GroupStart() const {
+    CHECK(group_start_);
+    return group_start_();
+  }
+  [[nodiscard]] ncclResult_t GroupEnd() const {
+    CHECK(group_end_);
+    return group_end_();
+  }
+
+  [[nodiscard]] const char* GetErrorString(ncclResult_t result) const {
+    return get_error_string_(result);
+  }
+};
+}  // namespace xgboost::collective
+
+#endif  // defined(XGBOOST_USE_NCCL)
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -115,30 +115,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) {  // NOLINT
 }
 namespace dh {

-#ifdef XGBOOST_USE_NCCL
-#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
-
-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
-  if (code != ncclSuccess) {
-    std::stringstream ss;
-    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
-    ss << " " << file << "(" << line << ")\n";
-    if (code == ncclUnhandledCudaError) {
-      // nccl usually preserves the last error so we can get more details.
-      auto err = cudaPeekAtLastError();
-      ss << "  CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
-    } else if (code == ncclSystemError) {
-      ss << "  This might be caused by a network configuration issue. Please consider specifying "
-            "the network interface for NCCL via environment variables listed in its reference: "
-            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
-    }
-    LOG(FATAL) << ss.str();
-  }
-
-  return code;
-}
-#endif
-
 inline int32_t CudaGetPointerDevice(void const *ptr) {
  int32_t device = -1;
  cudaPointerAttributes attr;