Use dlopen to load NCCL. (#9796)
This PR adds optional support for loading nccl with `dlopen` as an alternative of compile time linking. This is to address the size bloat issue with the PyPI binary release. - Add CMake option to load `nccl` at runtime. - Add an NCCL stub. After this, `nccl` will be fetched from PyPI when using pip to install XGBoost, either by a user or by `pyproject.toml`. Others who want to link the nccl at compile time can continue to do so without any change. At the moment, this is Linux only since we only support MNMG on Linux.
This commit is contained in:
@@ -90,10 +90,10 @@ class Worker : public NCCLWorkerForTest {
|
||||
}
|
||||
};
|
||||
|
||||
class AllgatherTestGPU : public SocketTest {};
|
||||
class MGPUAllgatherTest : public SocketTest {};
|
||||
} // namespace
|
||||
|
||||
TEST_F(AllgatherTestGPU, MGPUTestVRing) {
|
||||
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
@@ -104,7 +104,7 @@ TEST_F(AllgatherTestGPU, MGPUTestVRing) {
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(AllgatherTestGPU, MGPUTestVBcast) {
|
||||
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
|
||||
@@ -5,17 +5,15 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h> // for host_vector
|
||||
|
||||
#include "../../../src/collective/coll.h" // for Coll
|
||||
#include "../../../src/common/common.h"
|
||||
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "../helpers.h" // for MakeCUDACtx
|
||||
#include "test_worker.cuh" // for NCCLWorkerForTest
|
||||
#include "test_worker.h" // for WorkerForTest, TestDistributed
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace {
|
||||
class AllreduceTestGPU : public SocketTest {};
|
||||
class MGPUAllreduceTest : public SocketTest {};
|
||||
|
||||
class Worker : public NCCLWorkerForTest {
|
||||
public:
|
||||
@@ -47,7 +45,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
};
|
||||
} // namespace
|
||||
|
||||
TEST_F(AllreduceTestGPU, BitOr) {
|
||||
TEST_F(MGPUAllreduceTest, BitOr) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
@@ -57,7 +55,7 @@ TEST_F(AllreduceTestGPU, BitOr) {
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(AllreduceTestGPU, Sum) {
|
||||
TEST_F(MGPUAllreduceTest, Sum) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <bitset>
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../../src/collective/comm.cuh"
|
||||
#include "../../../src/collective/communicator-inl.cuh"
|
||||
#include "../../../src/collective/nccl_device_communicator.cuh"
|
||||
#include "../helpers.h"
|
||||
@@ -16,17 +17,15 @@ namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
|
||||
auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
|
||||
auto construct = []() { NcclDeviceCommunicator comm{-1, false, DefaultNcclName()}; };
|
||||
EXPECT_THROW(construct(), dmlc::Error);
|
||||
}
|
||||
|
||||
TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
|
||||
try {
|
||||
dh::safe_nccl(ncclSystemError);
|
||||
} catch (dmlc::Error const& e) {
|
||||
auto str = std::string{e.what()};
|
||||
ASSERT_TRUE(str.find("environment variables") != std::string::npos);
|
||||
}
|
||||
auto stub = std::make_shared<NcclStub>(DefaultNcclName());
|
||||
auto rc = GetNCCLResult(stub, ncclSystemError);
|
||||
auto msg = rc.Report();
|
||||
ASSERT_TRUE(msg.find("environment variables") != std::string::npos);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -33,7 +33,7 @@ class WorkerForTest {
|
||||
tracker_port_{port},
|
||||
world_size_{world},
|
||||
task_id_{"t:" + std::to_string(rank)},
|
||||
comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} {
|
||||
comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} {
|
||||
CHECK_EQ(world_size_, comm_.World());
|
||||
}
|
||||
virtual ~WorkerForTest() = default;
|
||||
|
||||
Reference in New Issue
Block a user