Multi-GPU HostDeviceVector. (#3287)
* Multi-GPU HostDeviceVector. - HostDeviceVector instances can now span multiple devices, defined by GPUSet struct - the interface of HostDeviceVector has been modified accordingly - GPU objective functions are now multi-GPU - GPU predicting from cache is now multi-GPU - avoiding omp_set_num_threads() calls - other minor changes
This commit is contained in:
committed by
Rory Mitchell
parent
90a5c4db9d
commit
b8a0d66fe6
@@ -1004,14 +1004,29 @@ class AllReducer {
|
||||
|
||||
template <typename T, typename FunctionT>
|
||||
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
|
||||
auto previous_num_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(shards->size());
|
||||
#pragma omp parallel
|
||||
{
|
||||
auto cpu_thread_id = omp_get_thread_num();
|
||||
f(shards->at(cpu_thread_id));
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int shard = 0; shard < shards->size(); ++shard) {
|
||||
f(shards->at(shard));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Executes some operation on each element of the input vector, using a
|
||||
* single controlling thread for each element. In addition, passes the shard index
|
||||
* into the function.
|
||||
*
|
||||
* \tparam T Generic type parameter.
|
||||
* \tparam FunctionT Type of the function t.
|
||||
* \param shards The shards.
|
||||
* \param f The func_t to process.
|
||||
*/
|
||||
|
||||
template <typename T, typename FunctionT>
|
||||
void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int shard = 0; shard < shards->size(); ++shard) {
|
||||
f(shard, shards->at(shard));
|
||||
}
|
||||
omp_set_num_threads(previous_num_threads);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
|
||||
|
||||
template <typename ReduceT,typename T, typename FunctionT>
|
||||
ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
|
||||
auto previous_num_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(shards->size());
|
||||
std::vector<ReduceT> sums(shards->size());
|
||||
#pragma omp parallel
|
||||
{
|
||||
auto cpu_thread_id = omp_get_thread_num();
|
||||
sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int shard = 0; shard < shards->size(); ++shard) {
|
||||
sums[shard] = f(shards->at(shard));
|
||||
}
|
||||
omp_set_num_threads(previous_num_threads);
|
||||
return std::accumulate(sums.begin(), sums.end(), ReduceT());
|
||||
}
|
||||
} // namespace dh
|
||||
|
||||
Reference in New Issue
Block a user