Combine thread launches into single launch per tree for gpu_hist (#4343)

* Combine thread launches into single launch per tree for gpu_hist
algorithm.

* Address deprecation warning

* Add manual column sampler constructor

* Turn off omp dynamic to get a guaranteed number of threads

* Enable openmp in cuda code
This commit is contained in:
Rory Mitchell
2019-04-29 09:58:34 +12:00
committed by GitHub
parent 146e83f3b3
commit 5e582b0fa7
10 changed files with 402 additions and 325 deletions

View File

@@ -12,6 +12,7 @@
#include "span.h"
#include <algorithm>
#include <omp.h>
#include <chrono>
#include <ctime>
#include <cub/cub.cuh>
@@ -752,6 +753,29 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
});
}
/**
 * \class SaveCudaContext
 *
 * \brief Runs a functor and afterwards switches back to the CUDA device that
 * was current before the functor ran.
 *
 * The current device is recorded in the constructor and the functor is invoked
 * immediately. The recorded device is made current again by the destructor,
 * or, if the functor throws, before the exception leaves the constructor.
 */
class SaveCudaContext {
 private:
  /*! \brief Device recorded at construction; -1 if it could not be queried. */
  int saved_device_;

 public:
  /**
   * \brief Record the current device, then invoke func.
   *
   * \param func Callable invoked with no arguments. Any exception it throws
   *             is rethrown after the recorded device has been restored.
   */
  template <typename Functor>
  explicit SaveCudaContext(Functor func) : saved_device_{-1} {
    // When compiled with CUDA but running on CPU only device,
    // cudaGetDevice will fail.
    try {
      safe_cuda(cudaGetDevice(&saved_device_));
    } catch (const dmlc::Error &) {
      saved_device_ = -1;
    }
    try {
      func();
    } catch (...) {
      // A constructor that exits by exception never has its destructor run,
      // so the device has to be restored here before propagating the error.
      if (saved_device_ != -1) {
        // Best effort only: the result is deliberately ignored so that a
        // failure here cannot replace the exception thrown by func.
        (void)cudaSetDevice(saved_device_);
      }
      throw;
    }
  }

  /*! \brief Make the recorded device current again, if one was recorded. */
  ~SaveCudaContext() {
    if (saved_device_ != -1) {
      safe_cuda(cudaSetDevice(saved_device_));
    }
  }
};
/**
* \class AllReducer
*
@@ -777,8 +801,18 @@ class AllReducer {
allreduce_calls_(0) {}
/**
* \fn bool IsSingleGPU()
*
* \brief Returns true if we are using a single GPU only.
*/
bool IsSingleGPU() {
#ifndef XGBOOST_USE_NCCL
  // Built without NCCL: always reported as single GPU.
  return true;
#else
  CHECK(device_counts.size() > 0) << "AllReducer not initialised.";
  // More than one entry in device_counts rules out the single GPU case.
  if (device_counts.size() > 1) {
    return false;
  }
  // Exactly one entry: single GPU only when that entry is 1.
  return device_counts.at(0) == 1;
#endif
}
/**
* \brief Initialise with the desired device ordinals for this communication
* group.
*
@@ -956,6 +990,22 @@ class AllReducer {
#endif
};
/**
* \brief Synchronizes the device
*
* \param device_id Identifier for the device.
*/
void Synchronize(int device_id) {
#ifdef XGBOOST_USE_NCCL
  // SaveCudaContext runs the lambda immediately and switches back to the
  // previously current device once the temporary is destroyed.
  SaveCudaContext([&]() {
    dh::safe_cuda(cudaSetDevice(device_id));
    // The position of device_id within device_ordinals is used as the index
    // into streams. Compare iterators rather than a narrowed int against
    // size(), which avoids a signed/unsigned comparison.
    const auto pos = std::find(device_ordinals.begin(), device_ordinals.end(),
                               device_id);
    CHECK(pos != device_ordinals.end())
        << "Device " << device_id << " is not managed by this AllReducer.";
    const auto idx = static_cast<size_t>(pos - device_ordinals.begin());
    dh::safe_cuda(cudaStreamSynchronize(streams[idx]));
  });
#else
  // Nothing to synchronise when built without NCCL.
  (void)device_id;
#endif
}
#ifdef XGBOOST_USE_NCCL
/**
* \fn ncclUniqueId GetUniqueId()
@@ -980,29 +1030,6 @@ class AllReducer {
#endif
};
/**
 * \class SaveCudaContext
 *
 * \brief Runs a functor and afterwards switches back to the CUDA device that
 * was current before the functor ran.
 *
 * The current device is recorded in the constructor and the functor is invoked
 * immediately. The recorded device is made current again by the destructor,
 * or, if the functor throws, before the exception leaves the constructor.
 */
class SaveCudaContext {
 private:
  /*! \brief Device recorded at construction; -1 if it could not be queried. */
  int saved_device_;

 public:
  /**
   * \brief Record the current device, then invoke func.
   *
   * \param func Callable invoked with no arguments. Any exception it throws
   *             is rethrown after the recorded device has been restored.
   */
  template <typename Functor>
  explicit SaveCudaContext(Functor func) : saved_device_{-1} {
    // When compiled with CUDA but running on CPU only device,
    // cudaGetDevice will fail.
    try {
      safe_cuda(cudaGetDevice(&saved_device_));
    } catch (const dmlc::Error &) {
      saved_device_ = -1;
    }
    try {
      func();
    } catch (...) {
      // A constructor that exits by exception never has its destructor run,
      // so the device has to be restored here before propagating the error.
      if (saved_device_ != -1) {
        // Best effort only: the result is deliberately ignored so that a
        // failure here cannot replace the exception thrown by func.
        (void)cudaSetDevice(saved_device_);
      }
      throw;
    }
  }

  /*! \brief Make the recorded device current again, if one was recorded. */
  ~SaveCudaContext() {
    if (saved_device_ != -1) {
      safe_cuda(cudaSetDevice(saved_device_));
    }
  }
};
/**
* \brief Executes some operation on each element of the input vector, using a
* single controlling thread for each element. In addition, passes the shard index
@@ -1017,11 +1044,15 @@ class SaveCudaContext {
template <typename T, typename FunctionT>
void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
  // All of the work runs inside a SaveCudaContext, so the device that was
  // current on entry is made current again once every shard has been visited.
  auto visit_all_shards = [&]() {
    // Dynamic thread adjustment is switched off for the duration of the loop
    // and put back to its previous setting afterwards.
    const bool was_dynamic = omp_get_dynamic();
    omp_set_dynamic(false);

    const long n_shards = static_cast<long>(shards->size());
    // The parallel region is only activated when there is more than one shard.
#pragma omp parallel for schedule(static, 1) if (n_shards > 1)
    for (long idx = 0; idx < n_shards; ++idx) {
      f(idx, shards->at(idx));
    }

    omp_set_dynamic(was_dynamic);
  };
  SaveCudaContext{visit_all_shards};
}