Use dlopen to load NCCL. (#9796)
This PR adds optional support for loading NCCL with `dlopen` as an alternative to compile-time linking. This is to address the size bloat issue with the PyPI binary release. - Add a CMake option to load `nccl` at runtime. - Add an NCCL stub. After this change, `nccl` will be fetched from PyPI when XGBoost is installed with pip, either directly by a user or through `pyproject.toml`. Those who want to link NCCL at compile time can continue to do so without any change. At the moment, this is Linux-only, since we only support MNMG on Linux.
This commit is contained in:
@@ -5,9 +5,11 @@
|
||||
|
||||
#include <memory> // for shared_ptr
|
||||
|
||||
#include "../../src/collective/coll.h" // for Coll
|
||||
#include "../../src/common/device_helpers.cuh" // for CUDAStreamView
|
||||
#include "federated_comm.h" // for FederatedComm
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace xgboost::collective {
|
||||
class CUDAFederatedComm : public FederatedComm {
|
||||
@@ -16,5 +18,9 @@ class CUDAFederatedComm : public FederatedComm {
|
||||
public:
|
||||
explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr<FederatedComm const> impl);
|
||||
[[nodiscard]] auto Stream() const { return stream_; }
|
||||
Comm* MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const override {
|
||||
LOG(FATAL) << "[Internal Error]: Invalid request for CUDA variant.";
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -10,12 +10,12 @@
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../src/collective/comm.h" // for Comm
|
||||
#include "../../src/collective/comm.h" // for HostComm
|
||||
#include "../../src/common/json_utils.h" // for OptionalArg
|
||||
#include "xgboost/json.h"
|
||||
|
||||
namespace xgboost::collective {
|
||||
class FederatedComm : public Comm {
|
||||
class FederatedComm : public HostComm {
|
||||
std::shared_ptr<federated::Federated::Stub> stub_;
|
||||
|
||||
void Init(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank,
|
||||
@@ -64,6 +64,6 @@ class FederatedComm : public Comm {
|
||||
[[nodiscard]] bool IsFederated() const override { return true; }
|
||||
[[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); }
|
||||
|
||||
Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
|
||||
[[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
|
||||
};
|
||||
} // namespace xgboost::collective
|
||||
|
||||
Reference in New Issue
Block a user