Enable distributed GPU training over Rabit (#7930)
This commit is contained in:
@@ -274,6 +274,24 @@ template <typename Indexable>
|
||||
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
|
||||
return indptr[group + 1] - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief A CRTP (curiously recurring template pattern) helper function.
|
||||
*
|
||||
* https://www.fluentcpp.com/2017/05/19/crtp-helper/
|
||||
*
|
||||
* Does two things:
|
||||
* 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
|
||||
* 2. Avoids having to `static_cast` in a lot of places.
|
||||
*
|
||||
* @tparam T The derived class in a CRTP hierarchy.
|
||||
*/
|
||||
template <typename T>
|
||||
struct Crtp {
|
||||
T &Underlying() { return static_cast<T &>(*this); }
|
||||
T const &Underlying() const { return static_cast<T const &>(*this); }
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_COMMON_H_
|
||||
|
||||
@@ -30,19 +30,15 @@ std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
||||
void AllReducer::Init(int _device_ordinal) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
device_ordinal_ = _device_ordinal;
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
|
||||
void NcclAllReducer::DoInit(int _device_ordinal) {
|
||||
int32_t const rank = rabit::GetRank();
|
||||
int32_t const world = rabit::GetWorldSize();
|
||||
|
||||
std::vector<uint64_t> uuids(world * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
|
||||
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
|
||||
GetCudaUUID(world, rank, device_ordinal_, s_this_uuid);
|
||||
GetCudaUUID(world, rank, _device_ordinal, s_this_uuid);
|
||||
|
||||
// No allgather yet.
|
||||
rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
|
||||
@@ -66,20 +62,11 @@ void AllReducer::Init(int _device_ordinal) {
|
||||
id_ = GetUniqueId();
|
||||
dh::safe_nccl(ncclCommInitRank(&comm_, rabit::GetWorldSize(), id_, rank));
|
||||
safe_cuda(cudaStreamCreate(&stream_));
|
||||
initialised_ = true;
|
||||
#else
|
||||
if (rabit::IsDistributed()) {
|
||||
LOG(FATAL) << "XGBoost is not compiled with NCCL.";
|
||||
}
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
}
|
||||
|
||||
void AllReducer::AllGather(void const *data, size_t length_bytes,
|
||||
std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
void NcclAllReducer::DoAllGather(void const *data, size_t length_bytes,
|
||||
std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
segments->clear();
|
||||
segments->resize(world, 0);
|
||||
@@ -98,11 +85,9 @@ void AllReducer::AllGather(void const *data, size_t length_bytes,
|
||||
offset += as_bytes;
|
||||
}
|
||||
safe_nccl(ncclGroupEnd());
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
}
|
||||
|
||||
AllReducer::~AllReducer() {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
NcclAllReducer::~NcclAllReducer() {
|
||||
if (initialised_) {
|
||||
dh::safe_cuda(cudaStreamDestroy(stream_));
|
||||
ncclCommDestroy(comm_);
|
||||
@@ -112,7 +97,41 @@ AllReducer::~AllReducer() {
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
|
||||
}
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
}
|
||||
#else
|
||||
void RabitAllReducer::DoInit(int _device_ordinal) {
|
||||
#if !defined(XGBOOST_USE_FEDERATED)
|
||||
if (rabit::IsDistributed()) {
|
||||
LOG(CONSOLE) << "XGBoost is not compiled with NCCL, falling back to Rabit.";
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void RabitAllReducer::DoAllGather(void const *data, size_t length_bytes,
|
||||
std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
segments->clear();
|
||||
segments->resize(world, 0);
|
||||
segments->at(rabit::GetRank()) = length_bytes;
|
||||
rabit::Allreduce<rabit::op::Max>(segments->data(), segments->size());
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
recvbuf->resize(total_bytes);
|
||||
|
||||
sendrecvbuf_.reserve(total_bytes);
|
||||
auto rank = rabit::GetRank();
|
||||
size_t offset = 0;
|
||||
for (int32_t i = 0; i < world; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
if (i == rank) {
|
||||
safe_cuda(
|
||||
cudaMemcpy(sendrecvbuf_.data() + offset, data, segments->at(rank), cudaMemcpyDefault));
|
||||
}
|
||||
rabit::Broadcast(sendrecvbuf_.data() + offset, as_bytes, i);
|
||||
offset += as_bytes;
|
||||
}
|
||||
safe_cuda(cudaMemcpy(recvbuf->data().get(), sendrecvbuf_.data(), total_bytes, cudaMemcpyDefault));
|
||||
}
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
|
||||
} // namespace dh
|
||||
|
||||
@@ -738,71 +738,55 @@ using TypedDiscard =
|
||||
* \class AllReducer
|
||||
*
|
||||
* \brief All reducer class that manages its own communication group and
|
||||
* streams. Must be initialised before use. If XGBoost is compiled without NCCL
|
||||
* this is a dummy class that will error if used with more than one GPU.
|
||||
* streams. Must be initialised before use. If XGBoost is compiled without NCCL,
|
||||
* this falls back to use Rabit.
|
||||
*/
|
||||
class AllReducer {
|
||||
bool initialised_ {false};
|
||||
size_t allreduce_bytes_ {0}; // Keep statistics of the number of bytes communicated
|
||||
size_t allreduce_calls_ {0}; // Keep statistics of the number of reduce calls
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
ncclComm_t comm_;
|
||||
cudaStream_t stream_;
|
||||
int device_ordinal_;
|
||||
ncclUniqueId id_;
|
||||
#endif
|
||||
|
||||
template <typename AllReducer>
|
||||
class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
|
||||
public:
|
||||
AllReducer() = default;
|
||||
virtual ~AllReducerBase() = default;
|
||||
|
||||
/**
|
||||
* \brief Initialise with the desired device ordinal for this communication
|
||||
* group.
|
||||
* \brief Initialise with the desired device ordinal for this allreducer.
|
||||
*
|
||||
* \param device_ordinal The device ordinal.
|
||||
*/
|
||||
void Init(int _device_ordinal);
|
||||
|
||||
~AllReducer();
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
|
||||
void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
CHECK(initialised_);
|
||||
void Init(int _device_ordinal) {
|
||||
device_ordinal_ = _device_ordinal;
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm_, stream_));
|
||||
allreduce_bytes_ += count * sizeof(double);
|
||||
allreduce_calls_ += 1;
|
||||
#endif
|
||||
this->Underlying().DoInit(_device_ordinal);
|
||||
initialised_ = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* different size of data on different workers.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length_bytes Size of input data in bytes.
|
||||
* \param segments Size of data on each worker.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void AllGather(void const* data, size_t length_bytes,
|
||||
std::vector<size_t>* segments, dh::caching_device_vector<char>* recvbuf);
|
||||
|
||||
void AllGather(uint32_t const* data, size_t length,
|
||||
dh::caching_device_vector<uint32_t>* recvbuf) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
void AllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf) {
|
||||
CHECK(initialised_);
|
||||
size_t world = rabit::GetWorldSize();
|
||||
recvbuf->resize(length * world);
|
||||
safe_nccl(ncclAllGather(data, recvbuf->data().get(), length, ncclUint32,
|
||||
comm_, stream_));
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllGather(data, length_bytes, segments, recvbuf);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allgather. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length Size of input data in bytes.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void AllGather(uint32_t const *data, size_t length,
|
||||
dh::caching_device_vector<uint32_t> *recvbuf) {
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllGather(data, length, recvbuf);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -813,15 +797,28 @@ class AllReducer {
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
|
||||
void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm_, stream_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(double);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(float);
|
||||
allreduce_calls_ += 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -833,48 +830,68 @@ class AllReducer {
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of.
|
||||
*/
|
||||
|
||||
void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
CHECK(initialised_);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, comm_, stream_));
|
||||
#endif
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(int64_t);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
CHECK(initialised_);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint32, ncclSum, comm_, stream_));
|
||||
#endif
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(uint32_t);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
CHECK(initialised_);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
|
||||
#endif
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(uint64_t);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
// Specialization for size_t, which is implementation defined so it might or might not
|
||||
// be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* Specialization for size_t, which is implementation defined so it might or might not
|
||||
* be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T = size_t,
|
||||
std::enable_if_t<std::is_same<size_t, T>::value &&
|
||||
!std::is_same<size_t, unsigned long long>::value> // NOLINT
|
||||
* = nullptr>
|
||||
void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
CHECK(initialised_);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
|
||||
#endif
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
|
||||
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
|
||||
allreduce_bytes_ += count * sizeof(T);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -883,13 +900,148 @@ class AllReducer {
|
||||
* \brief Synchronizes the entire communication group.
|
||||
*/
|
||||
void Synchronize() {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
CHECK(initialised_);
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_cuda(cudaStreamSynchronize(stream_));
|
||||
#endif
|
||||
};
|
||||
this->Underlying().DoSynchronize();
|
||||
}
|
||||
|
||||
protected:
|
||||
bool initialised_{false};
|
||||
size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated.
|
||||
size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls.
|
||||
|
||||
private:
|
||||
int device_ordinal_{-1};
|
||||
};
|
||||
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
class NcclAllReducer : public AllReducerBase<NcclAllReducer> {
|
||||
public:
|
||||
friend class AllReducerBase<NcclAllReducer>;
|
||||
|
||||
~NcclAllReducer() override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* \brief Initialise with the desired device ordinal for this communication
|
||||
* group.
|
||||
*
|
||||
* \param device_ordinal The device ordinal.
|
||||
*/
|
||||
void DoInit(int _device_ordinal);
|
||||
|
||||
/**
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* different size of data on different workers.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length_bytes Size of input data in bytes.
|
||||
* \param segments Size of data on each worker.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf);
|
||||
|
||||
/**
|
||||
* \brief Allgather. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length Size of input data in bytes.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(uint32_t const *data, size_t length,
|
||||
dh::caching_device_vector<uint32_t> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
recvbuf->resize(length * world);
|
||||
safe_nccl(ncclAllGather(data, recvbuf->data().get(), length, ncclUint32, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
|
||||
*
|
||||
* \param count Number of.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of.
|
||||
*/
|
||||
void DoAllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint32, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* Specialization for size_t, which is implementation defined so it might or might not
|
||||
* be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T = size_t,
|
||||
std::enable_if_t<std::is_same<size_t, T>::value &&
|
||||
!std::is_same<size_t, unsigned long long>::value> // NOLINT
|
||||
* = nullptr>
|
||||
void DoAllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Synchronizes the entire communication group.
|
||||
*/
|
||||
void DoSynchronize() { dh::safe_cuda(cudaStreamSynchronize(stream_)); }
|
||||
|
||||
/**
|
||||
* \fn ncclUniqueId GetUniqueId()
|
||||
*
|
||||
@@ -904,15 +1056,163 @@ class AllReducer {
|
||||
if (rabit::GetRank() == kRootRank) {
|
||||
dh::safe_nccl(ncclGetUniqueId(&id));
|
||||
}
|
||||
rabit::Broadcast(
|
||||
static_cast<void*>(&id),
|
||||
sizeof(ncclUniqueId),
|
||||
static_cast<int>(kRootRank));
|
||||
rabit::Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
|
||||
return id;
|
||||
}
|
||||
#endif
|
||||
|
||||
ncclComm_t comm_;
|
||||
cudaStream_t stream_;
|
||||
ncclUniqueId id_;
|
||||
};
|
||||
|
||||
using AllReducer = NcclAllReducer;
|
||||
#else
|
||||
class RabitAllReducer : public AllReducerBase<RabitAllReducer> {
|
||||
public:
|
||||
friend class AllReducerBase<RabitAllReducer>;
|
||||
|
||||
private:
|
||||
/**
|
||||
* \brief Initialise with the desired device ordinal for this allreducer.
|
||||
*
|
||||
* \param device_ordinal The device ordinal.
|
||||
*/
|
||||
static void DoInit(int _device_ordinal);
|
||||
|
||||
/**
|
||||
* \brief Allgather implemented as grouped calls to Broadcast. This way we can accept
|
||||
* different size of data on different workers.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length_bytes Size of input data in bytes.
|
||||
* \param segments Size of data on each worker.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *recvbuf);
|
||||
|
||||
/**
|
||||
* \brief Allgather. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param data Buffer storing the input data.
|
||||
* \param length Size of input data in bytes.
|
||||
* \param recvbuf Buffer storing the result of data from all workers.
|
||||
*/
|
||||
void DoAllGather(uint32_t *data, size_t length, dh::caching_device_vector<uint32_t> *recvbuf) {
|
||||
size_t world = rabit::GetWorldSize();
|
||||
auto total_size = length * world;
|
||||
recvbuf->resize(total_size);
|
||||
sendrecvbuf_.reserve(total_size);
|
||||
auto rank = rabit::GetRank();
|
||||
safe_cuda(cudaMemcpy(sendrecvbuf_.data() + rank * length, data, length, cudaMemcpyDefault));
|
||||
rabit::Allgather(sendrecvbuf_.data(), total_size, rank * length, length, length);
|
||||
safe_cuda(cudaMemcpy(data, sendrecvbuf_.data(), total_size, cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const double *sendbuff, double *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const float *sendbuff, float *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
void DoAllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* Specialization for size_t, which is implementation defined so it might or might not
|
||||
* be one of uint64_t/uint32_t/unsigned long long/unsigned long.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T = size_t,
|
||||
std::enable_if_t<std::is_same<size_t, T>::value &&
|
||||
!std::is_same<size_t, unsigned long long>::value> // NOLINT
|
||||
* = nullptr>
|
||||
void DoAllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
|
||||
RabitAllReduceSum(sendbuff, recvbuff, count);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Synchronizes the allreducer.
|
||||
*/
|
||||
void DoSynchronize() {}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL.
|
||||
*
|
||||
* Copy the device buffer to host, call rabit allreduce, then copy the buffer back
|
||||
* to device.
|
||||
*
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
template <typename T>
|
||||
void RabitAllReduceSum(const T *sendbuff, T *recvbuff, int count) {
|
||||
auto total_size = count * sizeof(T);
|
||||
sendrecvbuf_.reserve(total_size);
|
||||
safe_cuda(cudaMemcpy(sendrecvbuf_.data(), sendbuff, total_size, cudaMemcpyDefault));
|
||||
rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<T*>(sendrecvbuf_.data()), count);
|
||||
safe_cuda(cudaMemcpy(recvbuff, sendrecvbuf_.data(), total_size, cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/// Host buffer used to call rabit functions.
|
||||
std::vector<char> sendrecvbuf_{};
|
||||
};
|
||||
|
||||
using AllReducer = RabitAllReducer;
|
||||
#endif
|
||||
|
||||
template <typename VectorT, typename T = typename VectorT::value_type,
|
||||
typename IndexT = typename xgboost::common::Span<T>::index_type>
|
||||
xgboost::common::Span<T> ToSpan(
|
||||
|
||||
Reference in New Issue
Block a user