118 lines
4.5 KiB
Plaintext
118 lines
4.5 KiB
Plaintext
/**
|
|
* Copyright 2023, XGBoost Contributors
|
|
*/
|
|
#if defined(XGBOOST_USE_NCCL)
|
|
#include <gtest/gtest.h>
#include <thrust/device_vector.h>  // for device_vector
#include <thrust/equal.h>          // for equal
#include <xgboost/span.h>          // for Span

#include <chrono>   // for seconds
#include <cstddef>  // for size_t
#include <cstdint>  // for int32_t, int64_t
#include <numeric>  // for accumulate
#include <string>   // for string
#include <vector>   // for vector

#include "../../../src/collective/allgather.h"     // for RingAllgather
#include "../../../src/common/device_helpers.cuh"  // for ToSpan, device_vector
#include "../../../src/common/type.h"              // for EraseType
#include "test_worker.cuh"                         // for NCCLWorkerForTest
#include "test_worker.h"                           // for TestDistributed, WorkerForTest
|
|
|
|
namespace xgboost::collective {
|
|
namespace {
|
|
class Worker : public NCCLWorkerForTest {
|
|
public:
|
|
using NCCLWorkerForTest::NCCLWorkerForTest;
|
|
|
|
void TestV(AllgatherVAlgo algo) {
|
|
{
|
|
// basic test
|
|
std::size_t n = 1;
|
|
// create data
|
|
dh::device_vector<std::int32_t> data(n, comm_.Rank());
|
|
auto s_data = common::EraseType(common::Span{data.data().get(), data.size()});
|
|
// get size
|
|
std::vector<std::int64_t> sizes(comm_.World(), -1);
|
|
sizes[comm_.Rank()] = s_data.size_bytes();
|
|
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
// create result
|
|
dh::device_vector<std::int32_t> result(comm_.World(), -1);
|
|
auto s_result = common::EraseType(dh::ToSpan(result));
|
|
|
|
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
|
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
|
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
|
|
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
|
ASSERT_EQ(result[i], i);
|
|
}
|
|
}
|
|
{
|
|
// V test
|
|
std::size_t n = 256 * 256;
|
|
// create data
|
|
dh::device_vector<std::int32_t> data(n * nccl_comm_->Rank(), nccl_comm_->Rank());
|
|
auto s_data = common::EraseType(common::Span{data.data().get(), data.size()});
|
|
// get size
|
|
std::vector<std::int64_t> sizes(nccl_comm_->World(), 0);
|
|
sizes[comm_.Rank()] = dh::ToSpan(data).size_bytes();
|
|
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
auto n_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
|
|
// create result
|
|
dh::device_vector<std::int32_t> result(n_bytes / sizeof(std::int32_t), -1);
|
|
auto s_result = common::EraseType(dh::ToSpan(result));
|
|
|
|
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
|
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
|
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
|
ASSERT_TRUE(rc.OK()) << rc.Report();
|
|
// check segment size
|
|
if (algo != AllgatherVAlgo::kBcast) {
|
|
auto size = recv_seg[nccl_comm_->Rank() + 1] - recv_seg[nccl_comm_->Rank()];
|
|
ASSERT_EQ(size, n * nccl_comm_->Rank() * sizeof(std::int32_t));
|
|
ASSERT_EQ(size, sizes[nccl_comm_->Rank()]);
|
|
}
|
|
// check data
|
|
std::size_t k{0};
|
|
for (std::int32_t r = 0; r < nccl_comm_->World(); ++r) {
|
|
std::size_t s = n * r;
|
|
auto current = dh::ToSpan(result).subspan(k, s);
|
|
std::vector<std::int32_t> h_data(current.size());
|
|
dh::CopyDeviceSpanToVector(&h_data, current);
|
|
for (auto v : h_data) {
|
|
ASSERT_EQ(v, r);
|
|
}
|
|
k += s;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// Fixture used by the TEST_F cases in this file. It adds nothing of its own on top of
// SocketTest, which is declared elsewhere.
class AllgatherTestGPU : public SocketTest {};
|
|
} // namespace
|
|
|
|
// Starts common::AllVisibleGPUs() workers through TestDistributed. Every worker is set up once
// and then runs TestV with the ring algorithm followed by the broadcast algorithm.
TEST_F(AllgatherTestGPU, MGPUTestVRing) {
  auto const world_size = common::AllVisibleGPUs();
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker worker{host, port, timeout, world_size, rank};
    worker.Setup();
    worker.TestV(AllgatherVAlgo::kRing);
    worker.TestV(AllgatherVAlgo::kBcast);
  };
  TestDistributed(world_size, run_worker);
}
|
|
|
|
// Starts common::AllVisibleGPUs() workers through TestDistributed. Every worker is set up and
// runs TestV with the broadcast algorithm only.
TEST_F(AllgatherTestGPU, MGPUTestVBcast) {
  auto const world_size = common::AllVisibleGPUs();
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker worker{host, port, timeout, world_size, rank};
    worker.Setup();
    worker.TestV(AllgatherVAlgo::kBcast);
  };
  TestDistributed(world_size, run_worker);
}
|
|
} // namespace xgboost::collective
|
|
#endif // defined(XGBOOST_USE_NCCL)
|