Merge branch 'master' into sync-condition-2023Oct11
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/collective/allgather.h" // for RingAllgather
|
||||
#include "../../../src/collective/coll.h" // for Coll
|
||||
#include "../../../src/collective/comm.h" // for RabitComm
|
||||
#include "gtest/gtest.h" // for AssertionR...
|
||||
#include "test_worker.h" // for TestDistri...
|
||||
@@ -63,37 +64,79 @@ class Worker : public WorkerForTest {
|
||||
}
|
||||
}
|
||||
|
||||
void TestV() {
|
||||
{
|
||||
// basic test
|
||||
std::int32_t n{comm_.Rank()};
|
||||
std::vector<std::int32_t> result;
|
||||
auto rc = RingAllgatherV(comm_, common::Span{&n, 1}, &result);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
||||
ASSERT_EQ(result[i], i);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// V test
|
||||
std::vector<std::int32_t> data(comm_.Rank() + 1, comm_.Rank());
|
||||
std::vector<std::int32_t> result;
|
||||
auto rc = RingAllgatherV(comm_, common::Span{data.data(), data.size()}, &result);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
ASSERT_EQ(result.size(), (1 + comm_.World()) * comm_.World() / 2);
|
||||
std::int32_t k{0};
|
||||
for (std::int32_t r = 0; r < comm_.World(); ++r) {
|
||||
auto seg = common::Span{result.data(), result.size()}.subspan(k, (r + 1));
|
||||
if (comm_.Rank() == 0) {
|
||||
for (auto v : seg) {
|
||||
ASSERT_EQ(v, r);
|
||||
}
|
||||
k += seg.size();
|
||||
/**
 * @brief Validate the layout produced by the variable-length allgather tests.
 *
 * The segment for rank `r` is expected to hold `r + 1` copies of the value `r`. The
 * value comparison (and the advance of the running offset) is only performed on the
 * worker whose rank is 0; the other workers merely form the sub-span.
 *
 * @param result Gathered values, segments laid out back to back in rank order.
 */
void CheckV(common::Span<std::int32_t> result) {
  std::int32_t offset{0};
  for (std::int32_t rank = 0; rank < comm_.World(); ++rank) {
    // Segment of rank `rank`: (rank + 1) elements starting at the running offset.
    auto segment = result.subspan(offset, rank + 1);
    if (comm_.Rank() != 0) {
      continue;
    }
    for (auto value : segment) {
      ASSERT_EQ(value, rank);
    }
    offset += segment.size();
  }
}
|
||||
void TestVRing() {
|
||||
// V test
|
||||
std::vector<std::int32_t> data(comm_.Rank() + 1, comm_.Rank());
|
||||
std::vector<std::int32_t> result;
|
||||
auto rc = RingAllgatherV(comm_, common::Span{data.data(), data.size()}, &result);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
ASSERT_EQ(result.size(), (1 + comm_.World()) * comm_.World() / 2);
|
||||
CheckV(result);
|
||||
}
|
||||
|
||||
void TestVBasic() {
|
||||
// basic test
|
||||
std::int32_t n{comm_.Rank()};
|
||||
std::vector<std::int32_t> result;
|
||||
auto rc = RingAllgatherV(comm_, common::Span{&n, 1}, &result);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
||||
ASSERT_EQ(result[i], i);
|
||||
}
|
||||
}
|
||||
|
||||
void TestVAlgo() {
|
||||
// V test, broadcast
|
||||
std::vector<std::int32_t> data(comm_.Rank() + 1, comm_.Rank());
|
||||
auto s_data = common::Span{data.data(), data.size()};
|
||||
|
||||
std::vector<std::int64_t> sizes(comm_.World(), 0);
|
||||
sizes[comm_.Rank()] = s_data.size_bytes();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
std::shared_ptr<Coll> pcoll{new Coll{}};
|
||||
|
||||
std::vector<std::int64_t> recv_segments(comm_.World() + 1, 0);
|
||||
std::vector<std::int32_t> recv(std::accumulate(sizes.cbegin(), sizes.cend(), 0));
|
||||
|
||||
auto s_recv = common::Span{recv.data(), recv.size()};
|
||||
|
||||
rc = pcoll->AllgatherV(comm_, common::EraseType(s_data),
|
||||
common::Span{sizes.data(), sizes.size()},
|
||||
common::Span{recv_segments.data(), recv_segments.size()},
|
||||
common::EraseType(s_recv), AllgatherVAlgo::kBcast);
|
||||
ASSERT_TRUE(rc.OK());
|
||||
CheckV(s_recv);
|
||||
|
||||
// Test inplace
|
||||
auto test_inplace = [&] (AllgatherVAlgo algo) {
|
||||
std::fill_n(s_recv.data(), s_recv.size(), 0);
|
||||
auto current = s_recv.subspan(recv_segments[comm_.Rank()],
|
||||
recv_segments[comm_.Rank() + 1] - recv_segments[comm_.Rank()]);
|
||||
std::copy_n(data.data(), data.size(), current.data());
|
||||
rc = pcoll->AllgatherV(comm_, common::EraseType(current),
|
||||
common::Span{sizes.data(), sizes.size()},
|
||||
common::Span{recv_segments.data(), recv_segments.size()},
|
||||
common::EraseType(s_recv), algo);
|
||||
ASSERT_TRUE(rc.OK());
|
||||
CheckV(s_recv);
|
||||
};
|
||||
|
||||
test_inplace(AllgatherVAlgo::kBcast);
|
||||
test_inplace(AllgatherVAlgo::kRing);
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
@@ -106,12 +149,30 @@ TEST_F(AllgatherTest, Basic) {
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(AllgatherTest, V) {
|
||||
TEST_F(AllgatherTest, VBasic) {
|
||||
std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency());
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker worker{host, port, timeout, n_workers, r};
|
||||
worker.TestV();
|
||||
worker.TestVBasic();
|
||||
});
|
||||
}
|
||||
|
||||
// Hands TestDistributed a callback that builds a Worker and runs Worker::TestVRing.
// The worker count is the hardware concurrency, capped at 7.
TEST_F(AllgatherTest, VRing) {
  std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency());
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker w{host, port, timeout, n_workers, rank};
    w.TestVRing();
  };
  TestDistributed(n_workers, run_worker);
}
|
||||
|
||||
// Hands TestDistributed a callback that builds a Worker and runs Worker::TestVAlgo.
// The worker count is the hardware concurrency, capped at 7.
TEST_F(AllgatherTest, VAlgo) {
  std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency());
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker w{host, port, timeout, n_workers, rank};
    w.TestVAlgo();
  };
  TestDistributed(n_workers, run_worker);
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
117
tests/cpp/collective/test_allgather.cu
Normal file
117
tests/cpp/collective/test_allgather.cu
Normal file
@@ -0,0 +1,117 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/device_vector.h> // for device_vector
|
||||
#include <thrust/equal.h> // for equal
|
||||
#include <xgboost/span.h> // for Span
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t, int64_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/collective/allgather.h" // for RingAllgather
|
||||
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "test_worker.cuh" // for NCCLWorkerForTest
|
||||
#include "test_worker.h" // for TestDistributed, WorkerForTest
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace {
|
||||
class Worker : public NCCLWorkerForTest {
|
||||
public:
|
||||
using NCCLWorkerForTest::NCCLWorkerForTest;
|
||||
|
||||
void TestV(AllgatherVAlgo algo) {
|
||||
{
|
||||
// basic test
|
||||
std::size_t n = 1;
|
||||
// create data
|
||||
dh::device_vector<std::int32_t> data(n, comm_.Rank());
|
||||
auto s_data = common::EraseType(common::Span{data.data().get(), data.size()});
|
||||
// get size
|
||||
std::vector<std::int64_t> sizes(comm_.World(), -1);
|
||||
sizes[comm_.Rank()] = s_data.size_bytes();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
// create result
|
||||
dh::device_vector<std::int32_t> result(comm_.World(), -1);
|
||||
auto s_result = common::EraseType(dh::ToSpan(result));
|
||||
|
||||
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
||||
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
||||
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
|
||||
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
||||
ASSERT_EQ(result[i], i);
|
||||
}
|
||||
}
|
||||
{
|
||||
// V test
|
||||
std::size_t n = 256 * 256;
|
||||
// create data
|
||||
dh::device_vector<std::int32_t> data(n * nccl_comm_->Rank(), nccl_comm_->Rank());
|
||||
auto s_data = common::EraseType(common::Span{data.data().get(), data.size()});
|
||||
// get size
|
||||
std::vector<std::int64_t> sizes(nccl_comm_->World(), 0);
|
||||
sizes[comm_.Rank()] = dh::ToSpan(data).size_bytes();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
auto n_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
|
||||
// create result
|
||||
dh::device_vector<std::int32_t> result(n_bytes / sizeof(std::int32_t), -1);
|
||||
auto s_result = common::EraseType(dh::ToSpan(result));
|
||||
|
||||
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
||||
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
||||
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
// check segment size
|
||||
if (algo != AllgatherVAlgo::kBcast) {
|
||||
auto size = recv_seg[nccl_comm_->Rank() + 1] - recv_seg[nccl_comm_->Rank()];
|
||||
ASSERT_EQ(size, n * nccl_comm_->Rank() * sizeof(std::int32_t));
|
||||
ASSERT_EQ(size, sizes[nccl_comm_->Rank()]);
|
||||
}
|
||||
// check data
|
||||
std::size_t k{0};
|
||||
for (std::int32_t r = 0; r < nccl_comm_->World(); ++r) {
|
||||
std::size_t s = n * r;
|
||||
auto current = dh::ToSpan(result).subspan(k, s);
|
||||
std::vector<std::int32_t> h_data(current.size());
|
||||
dh::CopyDeviceSpanToVector(&h_data, current);
|
||||
for (auto v : h_data) {
|
||||
ASSERT_EQ(v, r);
|
||||
}
|
||||
k += s;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Fixture for the GPU allgather tests; adds nothing on top of SocketTest.
class AllgatherTestGPU : public SocketTest {};
|
||||
} // namespace
|
||||
|
||||
// One worker per visible GPU. Note that, despite the test name, the worker runs the ring
// variant followed by the broadcast variant on the same set-up.
TEST_F(AllgatherTestGPU, MGPUTestVRing) {
  auto n_workers = common::AllVisibleGPUs();
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker worker{host, port, timeout, n_workers, rank};
    worker.Setup();
    worker.TestV(AllgatherVAlgo::kRing);
    worker.TestV(AllgatherVAlgo::kBcast);
  };
  TestDistributed(n_workers, run_worker);
}
|
||||
|
||||
// One worker per visible GPU; each worker runs the broadcast variant of AllgatherV.
TEST_F(AllgatherTestGPU, MGPUTestVBcast) {
  auto n_workers = common::AllVisibleGPUs();
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker worker{host, port, timeout, n_workers, rank};
    worker.Setup();
    worker.TestV(AllgatherVAlgo::kBcast);
  };
  TestDistributed(n_workers, run_worker);
}
|
||||
} // namespace xgboost::collective
|
||||
#endif // defined(XGBOOST_USE_NCCL)
|
||||
@@ -6,10 +6,10 @@
|
||||
#include "../../../src/collective/allreduce.h"
|
||||
#include "../../../src/collective/coll.h" // for Coll
|
||||
#include "../../../src/collective/tracker.h"
|
||||
#include "test_worker.h" // for WorkerForTest, TestDistributed
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "test_worker.h" // for WorkerForTest, TestDistributed
|
||||
|
||||
namespace xgboost::collective {
|
||||
|
||||
namespace {
|
||||
class AllreduceWorker : public WorkerForTest {
|
||||
public:
|
||||
@@ -50,11 +50,10 @@ class AllreduceWorker : public WorkerForTest {
|
||||
}
|
||||
|
||||
void BitOr() {
|
||||
Context ctx;
|
||||
std::vector<std::uint32_t> data(comm_.World(), 0);
|
||||
data[comm_.Rank()] = ~std::uint32_t{0};
|
||||
auto pcoll = std::shared_ptr<Coll>{new Coll{}};
|
||||
auto rc = pcoll->Allreduce(&ctx, comm_, EraseType(common::Span{data.data(), data.size()}),
|
||||
auto rc = pcoll->Allreduce(comm_, common::EraseType(common::Span{data.data(), data.size()}),
|
||||
ArrayInterfaceHandler::kU4, Op::kBitwiseOR);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
for (auto v : data) {
|
||||
|
||||
70
tests/cpp/collective/test_allreduce.cu
Normal file
70
tests/cpp/collective/test_allreduce.cu
Normal file
@@ -0,0 +1,70 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h> // for host_vector
|
||||
|
||||
#include "../../../src/collective/coll.h" // for Coll
|
||||
#include "../../../src/common/common.h"
|
||||
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "../helpers.h" // for MakeCUDACtx
|
||||
#include "test_worker.cuh" // for NCCLWorkerForTest
|
||||
#include "test_worker.h" // for WorkerForTest, TestDistributed
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace {
|
||||
// Fixture for the GPU allreduce tests; adds nothing on top of SocketTest.
class AllreduceTestGPU : public SocketTest {};
|
||||
|
||||
class Worker : public NCCLWorkerForTest {
|
||||
public:
|
||||
using NCCLWorkerForTest::NCCLWorkerForTest;
|
||||
|
||||
void BitOr() {
|
||||
dh::device_vector<std::uint32_t> data(comm_.World(), 0);
|
||||
data[comm_.Rank()] = ~std::uint32_t{0};
|
||||
auto rc = nccl_coll_->Allreduce(*nccl_comm_, common::EraseType(dh::ToSpan(data)),
|
||||
ArrayInterfaceHandler::kU4, Op::kBitwiseOR);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
thrust::host_vector<std::uint32_t> h_data(data.size());
|
||||
thrust::copy(data.cbegin(), data.cend(), h_data.begin());
|
||||
for (auto v : h_data) {
|
||||
ASSERT_EQ(v, ~std::uint32_t{0});
|
||||
}
|
||||
}
|
||||
|
||||
void Acc() {
|
||||
dh::device_vector<double> data(314, 1.5);
|
||||
auto rc = nccl_coll_->Allreduce(*nccl_comm_, common::EraseType(dh::ToSpan(data)),
|
||||
ArrayInterfaceHandler::kF8, Op::kSum);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
for (std::size_t i = 0; i < data.size(); ++i) {
|
||||
auto v = data[i];
|
||||
ASSERT_EQ(v, 1.5 * static_cast<double>(comm_.World())) << i;
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
// One worker per visible GPU; each worker runs the bitwise-OR allreduce check.
TEST_F(AllreduceTestGPU, BitOr) {
  auto n_workers = common::AllVisibleGPUs();
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker worker{host, port, timeout, n_workers, rank};
    worker.Setup();
    worker.BitOr();
  };
  TestDistributed(n_workers, run_worker);
}
|
||||
|
||||
// One worker per visible GPU; each worker runs the sum allreduce check.
TEST_F(AllreduceTestGPU, Sum) {
  auto n_workers = common::AllVisibleGPUs();
  auto run_worker = [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                        std::int32_t rank) {
    Worker worker{host, port, timeout, n_workers, rank};
    worker.Setup();
    worker.Acc();
  };
  TestDistributed(n_workers, run_worker);
}
|
||||
} // namespace xgboost::collective
|
||||
#endif // defined(XGBOOST_USE_NCCL)
|
||||
@@ -47,5 +47,5 @@ TEST_F(BroadcastTest, Basic) {
|
||||
Worker worker{host, port, timeout, n_workers, r};
|
||||
worker.Run();
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
} // namespace xgboost::collective
|
||||
|
||||
32
tests/cpp/collective/test_worker.cuh
Normal file
32
tests/cpp/collective/test_worker.cuh
Normal file
@@ -0,0 +1,32 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <memory> // for shared_ptr
|
||||
|
||||
#include "../../../src/collective/coll.h" // for Coll
|
||||
#include "../../../src/collective/comm.h" // for Comm
|
||||
#include "test_worker.h"
|
||||
#include "xgboost/context.h" // for Context
|
||||
|
||||
namespace xgboost::collective {
|
||||
class NCCLWorkerForTest : public WorkerForTest {
|
||||
protected:
|
||||
std::shared_ptr<Coll> coll_;
|
||||
std::shared_ptr<xgboost::collective::Comm> nccl_comm_;
|
||||
std::shared_ptr<Coll> nccl_coll_;
|
||||
Context ctx_;
|
||||
|
||||
public:
|
||||
using WorkerForTest::WorkerForTest;
|
||||
|
||||
void Setup() {
|
||||
ctx_ = MakeCUDACtx(comm_.Rank());
|
||||
coll_.reset(new Coll{});
|
||||
nccl_comm_.reset(this->comm_.MakeCUDAVar(&ctx_, coll_));
|
||||
nccl_coll_.reset(coll_->MakeCUDAVar());
|
||||
ASSERT_EQ(comm_.World(), nccl_comm_->World());
|
||||
ASSERT_EQ(comm_.Rank(), nccl_comm_->Rank());
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::collective
|
||||
@@ -1,6 +1,7 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <chrono> // for seconds
|
||||
|
||||
Reference in New Issue
Block a user