This PR adds optional support for loading nccl with `dlopen` as an alternative to compile-time linking. This addresses the size bloat issue with the PyPI binary release. - Add a CMake option to load `nccl` at runtime. - Add an NCCL stub. After this, `nccl` will be fetched from PyPI when XGBoost is installed with pip, either by a user or via `pyproject.toml`. Those who want to link nccl at compile time can continue to do so without any change. At the moment, this is Linux-only, since we only support MNMG on Linux.
118 lines
4.5 KiB
Plaintext
118 lines
4.5 KiB
Plaintext
/**
|
|
* Copyright 2023, XGBoost Contributors
|
|
*/
|
|
#if defined(XGBOOST_USE_NCCL)
|
|
#include <gtest/gtest.h>
#include <thrust/device_vector.h>  // for device_vector
#include <thrust/equal.h>          // for equal
#include <xgboost/span.h>          // for Span

#include <chrono>   // for seconds
#include <cstddef>  // for size_t
#include <cstdint>  // for int32_t, int64_t
#include <numeric>  // for accumulate
#include <string>   // for string
#include <vector>   // for vector

#include "../../../src/collective/allgather.h"     // for RingAllgather
#include "../../../src/common/device_helpers.cuh"  // for ToSpan, device_vector
#include "../../../src/common/type.h"              // for EraseType
#include "test_worker.cuh"                         // for NCCLWorkerForTest
#include "test_worker.h"                           // for TestDistributed, WorkerForTest
|
|
|
|
namespace xgboost::collective {
|
|
namespace {
|
|
class Worker : public NCCLWorkerForTest {
|
|
public:
|
|
using NCCLWorkerForTest::NCCLWorkerForTest;
|
|
|
|
void TestV(AllgatherVAlgo algo) {
|
|
{
|
|
// basic test
|
|
std::size_t n = 1;
|
|
// create data
|
|
dh::device_vector<std::int32_t> data(n, comm_.Rank());
|
|
auto s_data = common::EraseType(common::Span{data.data().get(), data.size()});
|
|
// get size
|
|
std::vector<std::int64_t> sizes(comm_.World(), -1);
|
|
sizes[comm_.Rank()] = s_data.size_bytes();
|
|
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
// create result
|
|
dh::device_vector<std::int32_t> result(comm_.World(), -1);
|
|
auto s_result = common::EraseType(dh::ToSpan(result));
|
|
|
|
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
|
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
|
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
|
|
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
|
ASSERT_EQ(result[i], i);
|
|
}
|
|
}
|
|
{
|
|
// V test
|
|
std::size_t n = 256 * 256;
|
|
// create data
|
|
dh::device_vector<std::int32_t> data(n * nccl_comm_->Rank(), nccl_comm_->Rank());
|
|
auto s_data = common::EraseType(common::Span{data.data().get(), data.size()});
|
|
// get size
|
|
std::vector<std::int64_t> sizes(nccl_comm_->World(), 0);
|
|
sizes[comm_.Rank()] = dh::ToSpan(data).size_bytes();
|
|
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
auto n_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
|
|
// create result
|
|
dh::device_vector<std::int32_t> result(n_bytes / sizeof(std::int32_t), -1);
|
|
auto s_result = common::EraseType(dh::ToSpan(result));
|
|
|
|
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
|
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
|
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
// check segment size
|
|
if (algo != AllgatherVAlgo::kBcast) {
|
|
auto size = recv_seg[nccl_comm_->Rank() + 1] - recv_seg[nccl_comm_->Rank()];
|
|
ASSERT_EQ(size, n * nccl_comm_->Rank() * sizeof(std::int32_t));
|
|
ASSERT_EQ(size, sizes[nccl_comm_->Rank()]);
|
|
}
|
|
// check data
|
|
std::size_t k{0};
|
|
for (std::int32_t r = 0; r < nccl_comm_->World(); ++r) {
|
|
std::size_t s = n * r;
|
|
auto current = dh::ToSpan(result).subspan(k, s);
|
|
std::vector<std::int32_t> h_data(current.size());
|
|
dh::CopyDeviceSpanToVector(&h_data, current);
|
|
for (auto v : h_data) {
|
|
ASSERT_EQ(v, r);
|
|
}
|
|
k += s;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
/** @brief Fixture for the multi-GPU allgather tests; adds no members on top of SocketTest. */
class MGPUAllgatherTest : public SocketTest {
};
|
|
} // namespace
|
|
|
|
// Launches one worker per value returned by AllVisibleGPUs(). Each worker runs the
// variable-length allgather check with the ring algorithm and then, on the same worker,
// once more with the broadcast algorithm.
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
  auto n_gpus = common::AllVisibleGPUs();
  auto run_worker = [n_gpus](std::string host, std::int32_t port, std::chrono::seconds timeout,
                             std::int32_t rank) {
    Worker worker{host, port, timeout, n_gpus, rank};
    worker.Setup();
    worker.TestV(AllgatherVAlgo::kRing);
    worker.TestV(AllgatherVAlgo::kBcast);
  };
  TestDistributed(n_gpus, run_worker);
}
|
|
|
|
// Launches one worker per value returned by AllVisibleGPUs(). Each worker runs the
// variable-length allgather check with the broadcast algorithm only.
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
  auto n_gpus = common::AllVisibleGPUs();
  auto run_worker = [n_gpus](std::string host, std::int32_t port, std::chrono::seconds timeout,
                             std::int32_t rank) {
    Worker worker{host, port, timeout, n_gpus, rank};
    worker.Setup();
    worker.TestV(AllgatherVAlgo::kBcast);
  };
  TestDistributed(n_gpus, run_worker);
}
|
|
} // namespace xgboost::collective
|
|
#endif // defined(XGBOOST_USE_NCCL)
|