GPU performance logging/improvements (#3945)

- Improved GPU performance logging

- Only use one execute shards function

- Revert performance regression on multi-GPU

- Use threads to launch NCCL AllReduce
This commit is contained in:
Rory Mitchell
2018-11-29 14:36:51 +13:00
committed by GitHub
parent c5f92df475
commit a9d684db18
8 changed files with 127 additions and 102 deletions

View File

@@ -19,6 +19,7 @@
#include <sstream>
#include <string>
#include <vector>
#include "timer.h"
#ifdef XGBOOST_USE_NCCL
#include "nccl.h"
@@ -840,14 +841,17 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
*/
class AllReducer {
bool initialised;
bool initialised_;
bool debug_verbose_;
size_t allreduce_bytes_; // Keep statistics of the number of bytes communicated
size_t allreduce_calls_; // Keep statistics of the number of reduce calls
#ifdef XGBOOST_USE_NCCL
std::vector<ncclComm_t> comms;
std::vector<cudaStream_t> streams;
std::vector<int> device_ordinals;
#endif
public:
AllReducer() : initialised(false) {}
AllReducer() : initialised_(false),debug_verbose_(false) {}
/**
* \fn void Init(const std::vector<int> &device_ordinals)
@@ -858,8 +862,10 @@ class AllReducer {
* \param device_ordinals The device ordinals.
*/
void Init(const std::vector<int> &device_ordinals) {
void Init(const std::vector<int> &device_ordinals, bool debug_verbose) {
#ifdef XGBOOST_USE_NCCL
/** \brief this >monitor . init. */
this->debug_verbose_ = debug_verbose;
this->device_ordinals = device_ordinals;
comms.resize(device_ordinals.size());
dh::safe_nccl(ncclCommInitAll(comms.data(),
@@ -870,7 +876,7 @@ class AllReducer {
safe_cuda(cudaSetDevice(device_ordinals[i]));
safe_cuda(cudaStreamCreate(&streams[i]));
}
initialised = true;
initialised_ = true;
#else
CHECK_EQ(device_ordinals.size(), 1)
<< "XGBoost must be compiled with NCCL to use more than one GPU.";
@@ -878,7 +884,7 @@ class AllReducer {
}
~AllReducer() {
#ifdef XGBOOST_USE_NCCL
if (initialised) {
if (initialised_) {
for (auto &stream : streams) {
dh::safe_cuda(cudaStreamDestroy(stream));
}
@@ -886,6 +892,11 @@ class AllReducer {
ncclCommDestroy(comm);
}
}
if (debug_verbose_) {
LOG(CONSOLE) << "======== NCCL Statistics========";
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
LOG(CONSOLE) << "AllReduce total MB communicated: " << allreduce_bytes_/1000000;
}
#endif
}
@@ -920,11 +931,16 @@ class AllReducer {
void AllReduceSum(int communication_group_idx, const double *sendbuff,
double *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
CHECK(initialised);
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
comms.at(communication_group_idx),
streams.at(communication_group_idx)));
if(communication_group_idx == 0)
{
allreduce_bytes_ += count * sizeof(double);
allreduce_calls_ += 1;
}
#endif
}
@@ -942,7 +958,7 @@ class AllReducer {
void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
int64_t *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
CHECK(initialised);
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum,
@@ -989,27 +1005,6 @@ class SaveCudaContext {
}
};
/**
* \brief Executes some operation on each element of the input vector, using a
* single controlling thread for each element.
*
* \tparam T Generic type parameter.
* \tparam FunctionT Type of the function t.
* \param shards The shards.
* \param f The func_t to process.
*/
template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
SaveCudaContext {
[&](){
#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
for (int shard = 0; shard < shards->size(); ++shard) {
f(shards->at(shard));
}
}};
}
/**
* \brief Executes some operation on each element of the input vector, using a
* single controlling thread for each element. In addition, passes the shard index
@@ -1023,13 +1018,12 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
template <typename T, typename FunctionT>
void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
SaveCudaContext {
[&](){
SaveCudaContext{[&]() {
#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
for (int shard = 0; shard < shards->size(); ++shard) {
f(shard, shards->at(shard));
}
}};
for (int shard = 0; shard < shards->size(); ++shard) {
f(shard, shards->at(shard));
}
}};
}
/**