Multi-GPU HostDeviceVector. (#3287)

* Multi-GPU HostDeviceVector.

- HostDeviceVector instances can now span multiple devices, as defined by the GPUSet struct
- the interface of HostDeviceVector has been modified accordingly
- GPU objective functions are now multi-GPU
- GPU predicting from cache is now multi-GPU
- avoiding omp_set_num_threads() calls
- other minor changes
This commit is contained in:
Andrew V. Adinetz
2018-05-04 22:00:05 +02:00
committed by Rory Mitchell
parent 90a5c4db9d
commit b8a0d66fe6
9 changed files with 569 additions and 250 deletions

View File

@@ -1004,14 +1004,29 @@ class AllReducer {
template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
auto previous_num_threads = omp_get_max_threads();
omp_set_num_threads(shards->size());
#pragma omp parallel
{
auto cpu_thread_id = omp_get_thread_num();
f(shards->at(cpu_thread_id));
#pragma omp parallel for schedule(static, 1)
for (int shard = 0; shard < shards->size(); ++shard) {
f(shards->at(shard));
}
}
/**
* \brief Executes some operation on each element of the input vector, using a
* single controlling thread for each element. In addition, passes the shard index
* into the function.
*
* \tparam T Generic type parameter.
* \tparam FunctionT Type of the function t.
* \param shards The shards.
* \param f The func_t to process.
*/
template <typename T, typename FunctionT>
void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
#pragma omp parallel for schedule(static, 1)
for (int shard = 0; shard < shards->size(); ++shard) {
f(shard, shards->at(shard));
}
omp_set_num_threads(previous_num_threads);
}
/**
@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
template <typename ReduceT,typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
auto previous_num_threads = omp_get_max_threads();
omp_set_num_threads(shards->size());
std::vector<ReduceT> sums(shards->size());
#pragma omp parallel
{
auto cpu_thread_id = omp_get_thread_num();
sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
#pragma omp parallel for schedule(static, 1)
for (int shard = 0; shard < shards->size(); ++shard) {
sums[shard] = f(shards->at(shard));
}
omp_set_num_threads(previous_num_threads);
return std::accumulate(sums.begin(), sums.end(), ReduceT());
}
} // namespace dh