[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator * fix size_t specialization * really fix size_t * try again * add include * more include * fix lint errors * remove rabit includes * fix pylint error * return dict from communicator context * fix communicator shutdown * fix dask test * reset communicator mocklist * fix distributed tests * do not save device communicator * fix jvm gpu tests * add python test for federated communicator * Update gputreeshap submodule Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -1,139 +0,0 @@
|
||||
/*!
|
||||
* Copyright 2017-2019 XGBoost contributors
|
||||
*
|
||||
* \brief Utilities for CUDA.
|
||||
*/
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#include <nccl.h>
|
||||
#endif // #ifdef XGBOOST_USE_NCCL
|
||||
#include <sstream>
|
||||
|
||||
#include "device_helpers.cuh"
|
||||
|
||||
namespace dh {
|
||||
|
||||
constexpr std::size_t kUuidLength =
|
||||
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
|
||||
|
||||
void GetCudaUUID(int device_ord, xgboost::common::Span<uint64_t, kUuidLength> uuid) {
|
||||
cudaDeviceProp prob;
|
||||
safe_cuda(cudaGetDeviceProperties(&prob, device_ord));
|
||||
std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
|
||||
}
|
||||
|
||||
std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
|
||||
std::stringstream ss;
|
||||
for (auto v : uuid) {
|
||||
ss << std::hex << v;
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
void NcclAllReducer::DoInit(int _device_ordinal) {
|
||||
int32_t const rank = rabit::GetRank();
|
||||
int32_t const world = rabit::GetWorldSize();
|
||||
if (world == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> uuids(world * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
|
||||
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
|
||||
GetCudaUUID(_device_ordinal, s_this_uuid);
|
||||
|
||||
// No allgather yet.
|
||||
rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
|
||||
|
||||
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);;
|
||||
size_t j = 0;
|
||||
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
|
||||
converted[j] =
|
||||
xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
|
||||
j++;
|
||||
}
|
||||
|
||||
auto iter = std::unique(converted.begin(), converted.end());
|
||||
auto n_uniques = std::distance(converted.begin(), iter);
|
||||
|
||||
CHECK_EQ(n_uniques, world)
|
||||
<< "Multiple processes within communication group running on same CUDA "
|
||||
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
|
||||
|
||||
|
||||
id_ = GetUniqueId();
|
||||
dh::safe_nccl(ncclCommInitRank(&comm_, rabit::GetWorldSize(), id_, rank));
|
||||
safe_cuda(cudaStreamCreate(&stream_));
|
||||
}
|
||||
|
||||
void NcclAllReducer::DoAllGather(void const *data, size_t length_bytes,
|
||||
std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
int32_t world = rabit::GetWorldSize();
|
||||
segments->clear();
|
||||
segments->resize(world, 0);
|
||||
segments->at(rabit::GetRank()) = length_bytes;
|
||||
rabit::Allreduce<rabit::op::Max>(segments->data(), segments->size());
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0);
|
||||
recvbuf->resize(total_bytes);
|
||||
|
||||
size_t offset = 0;
|
||||
safe_nccl(ncclGroupStart());
|
||||
for (int32_t i = 0; i < world; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
safe_nccl(
|
||||
ncclBroadcast(data, recvbuf->data().get() + offset,
|
||||
as_bytes, ncclChar, i, comm_, stream_));
|
||||
offset += as_bytes;
|
||||
}
|
||||
safe_nccl(ncclGroupEnd());
|
||||
}
|
||||
|
||||
NcclAllReducer::~NcclAllReducer() {
|
||||
if (initialised_) {
|
||||
dh::safe_cuda(cudaStreamDestroy(stream_));
|
||||
ncclCommDestroy(comm_);
|
||||
}
|
||||
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
LOG(CONSOLE) << "======== NCCL Statistics========";
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
|
||||
}
|
||||
}
|
||||
#else
|
||||
void RabitAllReducer::DoInit(int _device_ordinal) {
|
||||
#if !defined(XGBOOST_USE_FEDERATED)
|
||||
if (rabit::IsDistributed()) {
|
||||
LOG(CONSOLE) << "XGBoost is not compiled with NCCL, falling back to Rabit.";
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void RabitAllReducer::DoAllGather(void const *data, size_t length_bytes,
|
||||
std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
segments->clear();
|
||||
segments->resize(world, 0);
|
||||
segments->at(rabit::GetRank()) = length_bytes;
|
||||
rabit::Allreduce<rabit::op::Max>(segments->data(), segments->size());
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
recvbuf->resize(total_bytes);
|
||||
|
||||
sendrecvbuf_.reserve(total_bytes);
|
||||
auto rank = rabit::GetRank();
|
||||
size_t offset = 0;
|
||||
for (int32_t i = 0; i < world; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
if (i == rank) {
|
||||
safe_cuda(
|
||||
cudaMemcpy(sendrecvbuf_.data() + offset, data, segments->at(rank), cudaMemcpyDefault));
|
||||
}
|
||||
rabit::Broadcast(sendrecvbuf_.data() + offset, as_bytes, i);
|
||||
offset += as_bytes;
|
||||
}
|
||||
safe_cuda(cudaMemcpy(recvbuf->data().get(), sendrecvbuf_.data(), total_bytes, cudaMemcpyDefault));
|
||||
}
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
|
||||
} // namespace dh
|
||||
@@ -19,7 +19,6 @@
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/binary_search.h>
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
#include <cub/cub.cuh>
|
||||
#include <cub/util_allocator.cuh>
|
||||
|
||||
@@ -36,6 +35,7 @@
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/global_config.h"
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "common.h"
|
||||
#include "algorithm.cuh"
|
||||
|
||||
@@ -404,7 +404,7 @@ inline detail::MemoryLogger &GlobalMemoryLogger() {
|
||||
// dh::DebugSyncDevice(__FILE__, __LINE__);
|
||||
inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
|
||||
if (file != "" && line != -1) {
|
||||
auto rank = rabit::GetRank();
|
||||
auto rank = xgboost::collective::GetRank();
|
||||
LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line;
|
||||
}
|
||||
safe_cuda(cudaDeviceSynchronize());
|
||||
@@ -423,7 +423,7 @@ using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
|
||||
|
||||
inline void ThrowOOMError(std::string const& err, size_t bytes) {
|
||||
auto device = CurrentDevice();
|
||||
auto rank = rabit::GetRank();
|
||||
auto rank = xgboost::collective::GetRank();
|
||||
std::stringstream ss;
|
||||
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
|
||||
<< "- Free memory: " << AvailableMemory(device) << "\n"
|
||||
@@ -737,512 +737,6 @@ using TypedDiscard =
|
||||
std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
|
||||
detail::TypedDiscard<T>>;
|
||||
|
||||
/**
|
||||
* \class AllReducer
|
||||
*
|
||||
* \brief All reducer class that manages its own communication group and
|
||||
* streams. Must be initialised before use. If XGBoost is compiled without NCCL,
|
||||
* this falls back to use Rabit.
|
||||
*/
|
||||
template <typename AllReducer>
|
||||
class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
|
||||
public:
|
||||
virtual ~AllReducerBase() = default;
|
||||
|
||||
/**
|
||||
* \brief Initialise with the desired device ordinal for this allreducer.
|
||||
*
|
||||
* \param device_ordinal The device ordinal.
|
||||
*/
|
||||
void Init(int _device_ordinal) {
|
||||
device_ordinal_ = _device_ordinal;
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
this->Underlying().DoInit(_device_ordinal);
|
||||
initialised_ = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* different size of data on different workers.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length_bytes Size of input data in bytes.
|
||||
* \param segments Size of data on each worker.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void AllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllGather(data, length_bytes, segments, recvbuf);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allgather. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length Size of input data in bytes.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void AllGather(uint32_t const *data, size_t length,
|
||||
dh::caching_device_vector<uint32_t> *recvbuf) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllGather(data, length, recvbuf);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(double);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(float);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
|
||||
*
|
||||
* \param count Number of.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of.
|
||||
*/
|
||||
void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(int64_t);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(uint32_t);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(uint64_t);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* Specialization for size_t, which is implementation defined so it might or might not
|
||||
* be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T = size_t,
|
||||
std::enable_if_t<std::is_same<size_t, T>::value &&
|
||||
!std::is_same<size_t, unsigned long long>::value> // NOLINT
|
||||
* = nullptr>
|
||||
void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
if (rabit::GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(T);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \fn void Synchronize()
|
||||
*
|
||||
* \brief Synchronizes the entire communication group.
|
||||
*/
|
||||
void Synchronize() {
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoSynchronize();
|
||||
}
|
||||
|
||||
protected:
|
||||
bool initialised_{false};
|
||||
size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated.
|
||||
size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls.
|
||||
|
||||
private:
|
||||
int device_ordinal_{-1};
|
||||
};
|
||||
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
class NcclAllReducer : public AllReducerBase<NcclAllReducer> {
|
||||
public:
|
||||
friend class AllReducerBase<NcclAllReducer>;
|
||||
|
||||
~NcclAllReducer() override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* \brief Initialise with the desired device ordinal for this communication
|
||||
* group.
|
||||
*
|
||||
* \param device_ordinal The device ordinal.
|
||||
*/
|
||||
void DoInit(int _device_ordinal);
|
||||
|
||||
/**
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* different size of data on different workers.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length_bytes Size of input data in bytes.
|
||||
* \param segments Size of data on each worker.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf);
|
||||
|
||||
/**
|
||||
* \brief Allgather. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length Size of input data in bytes.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(uint32_t const *data, size_t length,
|
||||
dh::caching_device_vector<uint32_t> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
recvbuf->resize(length * world);
|
||||
safe_nccl(ncclAllGather(data, recvbuf->data().get(), length, ncclUint32, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
|
||||
*
|
||||
* \param count Number of.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of.
|
||||
*/
|
||||
void DoAllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint32, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* Specialization for size_t, which is implementation defined so it might or might not
|
||||
* be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T = size_t,
|
||||
std::enable_if_t<std::is_same<size_t, T>::value &&
|
||||
!std::is_same<size_t, unsigned long long>::value> // NOLINT
|
||||
* = nullptr>
|
||||
void DoAllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Synchronizes the entire communication group.
|
||||
*/
|
||||
void DoSynchronize() { dh::safe_cuda(cudaStreamSynchronize(stream_)); }
|
||||
|
||||
/**
|
||||
* \fn ncclUniqueId GetUniqueId()
|
||||
*
|
||||
* \brief Gets the Unique ID from NCCL to be used in setting up interprocess
|
||||
* communication
|
||||
*
|
||||
* \return the Unique ID
|
||||
*/
|
||||
ncclUniqueId GetUniqueId() {
|
||||
static const int kRootRank = 0;
|
||||
ncclUniqueId id;
|
||||
if (rabit::GetRank() == kRootRank) {
|
||||
dh::safe_nccl(ncclGetUniqueId(&id));
|
||||
}
|
||||
rabit::Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
|
||||
return id;
|
||||
}
|
||||
|
||||
ncclComm_t comm_;
|
||||
cudaStream_t stream_;
|
||||
ncclUniqueId id_;
|
||||
};
|
||||
|
||||
using AllReducer = NcclAllReducer;
|
||||
#else
|
||||
class RabitAllReducer : public AllReducerBase<RabitAllReducer> {
|
||||
public:
|
||||
friend class AllReducerBase<RabitAllReducer>;
|
||||
|
||||
private:
|
||||
/**
|
||||
* \brief Initialise with the desired device ordinal for this allreducer.
|
||||
*
|
||||
* \param device_ordinal The device ordinal.
|
||||
*/
|
||||
static void DoInit(int _device_ordinal);
|
||||
|
||||
/**
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* different size of data on different workers.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length_bytes Size of input data in bytes.
|
||||
* \param segments Size of data on each worker.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf);
|
||||
|
||||
/**
|
||||
* \brief Allgather. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length Size of input data in bytes.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(uint32_t *data, size_t length, dh::caching_device_vector<uint32_t> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
auto total_size = length * world;
|
||||
recvbuf->resize(total_size);
|
||||
sendrecvbuf_.reserve(total_size);
|
||||
auto rank = rabit::GetRank();
|
||||
safe_cuda(cudaMemcpy(sendrecvbuf_.data() + rank * length, data, length, cudaMemcpyDefault));
|
||||
rabit::Allgather(sendrecvbuf_.data(), total_size, rank * length, length, length);
|
||||
safe_cuda(cudaMemcpy(data, sendrecvbuf_.data(), total_size, cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* Specialization for size_t, which is implementation defined so it might or might not
|
||||
* be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T = size_t,
|
||||
std::enable_if_t<std::is_same<size_t, T>::value &&
|
||||
!std::is_same<size_t, unsigned long long>::value> // NOLINT
|
||||
* = nullptr>
|
||||
void DoAllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Synchronizes the allreducer.
|
||||
*/
|
||||
void DoSynchronize() {}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* Copy the device buffer to host, call rabit allreduce, then copy the buffer back
|
||||
* to device.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T>
|
||||
void RabitAllReduceSum(const T *sendbuff, T *recvbuff, int count) {
|
||||
auto total_size = count * sizeof(T);
|
||||
sendrecvbuf_.reserve(total_size);
|
||||
safe_cuda(cudaMemcpy(sendrecvbuf_.data(), sendbuff, total_size, cudaMemcpyDefault));
|
||||
rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<T*>(sendrecvbuf_.data()), count);
|
||||
safe_cuda(cudaMemcpy(recvbuff, sendrecvbuf_.data(), total_size, cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/// Host buffer used to call rabit functions.
|
||||
std::vector<char> sendrecvbuf_{};
|
||||
};
|
||||
|
||||
using AllReducer = RabitAllReducer;
|
||||
#endif
|
||||
|
||||
template <typename VectorT, typename T = typename VectorT::value_type,
|
||||
typename IndexT = typename xgboost::common::Span<T>::index_type>
|
||||
xgboost::common::Span<T> ToSpan(
|
||||
|
||||
@@ -3,19 +3,14 @@
|
||||
* \file hist_util.cc
|
||||
*/
|
||||
#include <dmlc/timer.h>
|
||||
#include <dmlc/omp.h>
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/base.h"
|
||||
#include "../common/common.h"
|
||||
#include "hist_util.h"
|
||||
#include "random.h"
|
||||
#include "column_matrix.h"
|
||||
#include "quantile.h"
|
||||
#include "../data/gradient_index.h"
|
||||
|
||||
#if defined(XGBOOST_MM_PREFETCH_PRESENT)
|
||||
#include <xmmintrin.h>
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../data/adapter.h"
|
||||
#include "categorical.h"
|
||||
#include "hist_util.h"
|
||||
#include "rabit/rabit.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -144,8 +144,8 @@ struct QuantileAllreduce {
|
||||
void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_threads,
|
||||
std::vector<std::set<float>> *p_categories) {
|
||||
auto &categories = *p_categories;
|
||||
auto world_size = rabit::GetWorldSize();
|
||||
auto rank = rabit::GetRank();
|
||||
auto world_size = collective::GetWorldSize();
|
||||
auto rank = collective::GetRank();
|
||||
if (world_size == 1) {
|
||||
return;
|
||||
}
|
||||
@@ -163,7 +163,8 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
|
||||
size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker
|
||||
std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
|
||||
rabit::Allreduce<rabit::op::Sum>(global_feat_ptrs.data(), global_feat_ptrs.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(global_feat_ptrs.data(),
|
||||
global_feat_ptrs.size());
|
||||
|
||||
// move all categories into a flatten vector to prepare for allreduce
|
||||
size_t total = feature_ptr.back();
|
||||
@@ -176,7 +177,8 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
// indptr for indexing workers
|
||||
std::vector<size_t> global_worker_ptr(world_size + 1, 0);
|
||||
global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr
|
||||
rabit::Allreduce<rabit::op::Sum>(global_worker_ptr.data(), global_worker_ptr.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(global_worker_ptr.data(),
|
||||
global_worker_ptr.size());
|
||||
std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
|
||||
// total number of categories in all workers with all features
|
||||
auto gtotal = global_worker_ptr.back();
|
||||
@@ -188,7 +190,8 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
CHECK_EQ(rank_size, total);
|
||||
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
|
||||
// gather values from all workers.
|
||||
rabit::Allreduce<rabit::op::Sum>(global_categories.data(), global_categories.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(global_categories.data(),
|
||||
global_categories.size());
|
||||
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
|
||||
categories.size()};
|
||||
ParallelFor(categories.size(), n_threads, [&](auto fidx) {
|
||||
@@ -217,8 +220,8 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
|
||||
std::vector<typename WQSketch::Entry> *p_global_sketches) {
|
||||
auto &worker_segments = *p_worker_segments;
|
||||
worker_segments.resize(1, 0);
|
||||
auto world = rabit::GetWorldSize();
|
||||
auto rank = rabit::GetRank();
|
||||
auto world = collective::GetWorldSize();
|
||||
auto rank = collective::GetRank();
|
||||
auto n_columns = sketches_.size();
|
||||
|
||||
// get the size of each feature.
|
||||
@@ -237,7 +240,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
|
||||
std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);
|
||||
|
||||
// Gather all column pointers
|
||||
rabit::Allreduce<rabit::op::Sum>(sketches_scan.data(), sketches_scan.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(sketches_scan.data(), sketches_scan.size());
|
||||
for (int32_t i = 0; i < world; ++i) {
|
||||
size_t back = (i + 1) * (n_columns + 1) - 1;
|
||||
auto n_entries = sketches_scan.at(back);
|
||||
@@ -265,7 +268,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
|
||||
|
||||
static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
|
||||
"Unexpected size of sketch entry.");
|
||||
rabit::Allreduce<rabit::op::Sum>(
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<float *>(global_sketches.data()),
|
||||
global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
|
||||
}
|
||||
@@ -277,7 +280,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
|
||||
monitor_.Start(__func__);
|
||||
|
||||
size_t n_columns = sketches_.size();
|
||||
rabit::Allreduce<rabit::op::Max>(&n_columns, 1);
|
||||
collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
|
||||
CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";
|
||||
|
||||
AllreduceCategories(feature_types_, n_threads_, &categories_);
|
||||
@@ -291,7 +294,8 @@ void SketchContainerImpl<WQSketch>::AllReduce(
|
||||
|
||||
// Prune the intermediate num cuts for synchronization.
|
||||
std::vector<bst_row_t> global_column_size(columns_size_);
|
||||
rabit::Allreduce<rabit::op::Sum>(global_column_size.data(), global_column_size.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(global_column_size.data(),
|
||||
global_column_size.size());
|
||||
|
||||
ParallelFor(sketches_.size(), n_threads_, [&](size_t i) {
|
||||
int32_t intermediate_num_cuts = static_cast<int32_t>(
|
||||
@@ -311,7 +315,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
|
||||
num_cuts[i] = intermediate_num_cuts;
|
||||
});
|
||||
|
||||
auto world = rabit::GetWorldSize();
|
||||
auto world = collective::GetWorldSize();
|
||||
if (world == 1) {
|
||||
monitor_.Stop(__func__);
|
||||
return;
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "../collective/communicator.h"
|
||||
#include "../collective/device_communicator.cuh"
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "device_helpers.cuh"
|
||||
@@ -501,47 +503,41 @@ void SketchContainer::FixError() {
|
||||
|
||||
void SketchContainer::AllReduce() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
auto world = rabit::GetWorldSize();
|
||||
auto world = collective::GetWorldSize();
|
||||
if (world == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
timer_.Start(__func__);
|
||||
if (!reducer_) {
|
||||
reducer_ = std::make_shared<dh::AllReducer>();
|
||||
reducer_->Init(device_);
|
||||
}
|
||||
auto* communicator = collective::Communicator::GetDevice(device_);
|
||||
// Reduce the overhead on syncing.
|
||||
size_t global_sum_rows = num_rows_;
|
||||
rabit::Allreduce<rabit::op::Sum>(&global_sum_rows, 1);
|
||||
collective::Allreduce<collective::Operation::kSum>(&global_sum_rows, 1);
|
||||
size_t intermediate_num_cuts =
|
||||
std::min(global_sum_rows, static_cast<size_t>(num_bins_ * kFactor));
|
||||
this->Prune(intermediate_num_cuts);
|
||||
|
||||
|
||||
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
|
||||
CHECK_EQ(d_columns_ptr.size(), num_columns_ + 1);
|
||||
size_t n = d_columns_ptr.size();
|
||||
rabit::Allreduce<rabit::op::Max>(&n, 1);
|
||||
collective::Allreduce<collective::Operation::kMax>(&n, 1);
|
||||
CHECK_EQ(n, d_columns_ptr.size()) << "Number of columns differs across workers";
|
||||
|
||||
// Get the columns ptr from all workers
|
||||
dh::device_vector<SketchContainer::OffsetT> gathered_ptrs;
|
||||
gathered_ptrs.resize(d_columns_ptr.size() * world, 0);
|
||||
size_t rank = rabit::GetRank();
|
||||
size_t rank = collective::GetRank();
|
||||
auto offset = rank * d_columns_ptr.size();
|
||||
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
|
||||
gathered_ptrs.begin() + offset);
|
||||
reducer_->AllReduceSum(gathered_ptrs.data().get(), gathered_ptrs.data().get(),
|
||||
gathered_ptrs.size());
|
||||
communicator->AllReduceSum(gathered_ptrs.data().get(), gathered_ptrs.size());
|
||||
|
||||
// Get the data from all workers.
|
||||
std::vector<size_t> recv_lengths;
|
||||
dh::caching_device_vector<char> recvbuf;
|
||||
reducer_->AllGather(this->Current().data().get(),
|
||||
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths,
|
||||
&recvbuf);
|
||||
reducer_->Synchronize();
|
||||
communicator->AllGatherV(this->Current().data().get(), dh::ToSpan(this->Current()).size_bytes(),
|
||||
&recv_lengths, &recvbuf);
|
||||
communicator->Synchronize();
|
||||
|
||||
// Segment the received data.
|
||||
auto s_recvbuf = dh::ToSpan(recvbuf);
|
||||
|
||||
@@ -37,7 +37,6 @@ class SketchContainer {
|
||||
|
||||
private:
|
||||
Monitor timer_;
|
||||
std::shared_ptr<dh::AllReducer> reducer_;
|
||||
HostDeviceVector<FeatureType> feature_types_;
|
||||
bst_row_t num_rows_;
|
||||
bst_feature_t num_columns_;
|
||||
@@ -93,15 +92,12 @@ class SketchContainer {
|
||||
* \param num_columns Total number of columns in dataset.
|
||||
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
|
||||
* \param device GPU ID.
|
||||
* \param reducer Optional initialised reducer. Useful for speeding up testing.
|
||||
*/
|
||||
SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
|
||||
int32_t max_bin, bst_feature_t num_columns,
|
||||
bst_row_t num_rows, int32_t device,
|
||||
std::shared_ptr<dh::AllReducer> reducer = nullptr)
|
||||
bst_row_t num_rows, int32_t device)
|
||||
: num_rows_{num_rows},
|
||||
num_columns_{num_columns}, num_bins_{max_bin}, device_{device},
|
||||
reducer_(std::move(reducer)) {
|
||||
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
|
||||
CHECK_GE(device, 0);
|
||||
// Initialize Sketches for this dmatrix
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
|
||||
@@ -7,20 +7,21 @@
|
||||
#ifndef XGBOOST_COMMON_RANDOM_H_
|
||||
#define XGBOOST_COMMON_RANDOM_H_
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "common.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -143,7 +144,7 @@ class ColumnSampler {
|
||||
*/
|
||||
ColumnSampler() {
|
||||
uint32_t seed = common::GlobalRandom()();
|
||||
rabit::Broadcast(&seed, sizeof(seed), 0);
|
||||
collective::Broadcast(&seed, sizeof(seed), 0);
|
||||
rng_.seed(seed);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
/*!
|
||||
* Copyright by Contributors 2019
|
||||
*/
|
||||
#include <rabit/rabit.h>
|
||||
#include <algorithm>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include "timer.h"
|
||||
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
#include <nvToolsExt.h>
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
@@ -54,7 +53,7 @@ void Monitor::PrintStatistics(StatMap const& statistics) const {
|
||||
|
||||
void Monitor::Print() const {
|
||||
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
|
||||
auto rank = rabit::GetRank();
|
||||
auto rank = collective::GetRank();
|
||||
StatMap stat_map;
|
||||
for (auto const &kv : statistics_map_) {
|
||||
stat_map[kv.first] = std::make_pair(
|
||||
|
||||
Reference in New Issue
Block a user