[backport] Update CUDA docker image and NCCL. (#8139) (#8162)

* Update CUDA docker image and NCCL. (#8139)

* Rest of the CI.

* CPU test dependencies.
This commit is contained in:
Jiaming Yuan
2022-08-12 18:57:42 +08:00
committed by GitHub
parent a55d3bdde2
commit 39c1488a42
13 changed files with 68 additions and 45 deletions

View File

@@ -38,6 +38,9 @@ void AllReducer::Init(int _device_ordinal) {
int32_t const rank = rabit::GetRank();
int32_t const world = rabit::GetWorldSize();
if (world == 1) {
return;
}
std::vector<uint64_t> uuids(world * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};

View File

@@ -775,13 +775,16 @@ class AllReducer {
*/
void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
#ifdef XGBOOST_USE_NCCL
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm_, stream_));
allreduce_bytes_ += count * sizeof(double);
allreduce_calls_ += 1;
#endif
#endif // XGBOOST_USE_NCCL
}
/**
@@ -796,9 +799,12 @@ class AllReducer {
void AllGather(uint32_t const* data, size_t length,
dh::caching_device_vector<uint32_t>* recvbuf) {
size_t world = rabit::GetWorldSize();
if (world == 1) {
return;
}
#ifdef XGBOOST_USE_NCCL
CHECK(initialised_);
size_t world = rabit::GetWorldSize();
recvbuf->resize(length * world);
safe_nccl(ncclAllGather(data, recvbuf->data().get(), length, ncclUint32,
comm_, stream_));
@@ -813,9 +819,11 @@ class AllReducer {
* \param recvbuff The recvbuff.
* \param count Number of elements.
*/
void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm_, stream_));
@@ -836,6 +844,9 @@ class AllReducer {
void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
@@ -845,6 +856,9 @@ class AllReducer {
void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
@@ -853,6 +867,9 @@ class AllReducer {
}
void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
#ifdef XGBOOST_USE_NCCL
CHECK(initialised_);
@@ -867,12 +884,15 @@ class AllReducer {
std::enable_if_t<std::is_same<size_t, T>::value &&
!std::is_same<size_t, unsigned long long>::value> // NOLINT
* = nullptr>
void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
#ifdef XGBOOST_USE_NCCL
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
#endif
}