Multi-GPU HostDeviceVector. (#3287)

* Multi-GPU HostDeviceVector.

- HostDeviceVector instances can now span multiple devices, as defined by the GPUSet struct
- the interface of HostDeviceVector has been modified accordingly
- GPU objective functions are now multi-GPU
- GPU predicting from cache is now multi-GPU
- avoiding omp_set_num_threads() calls
- other minor changes
This commit is contained in:
Andrew V. Adinetz
2018-05-04 22:00:05 +02:00
committed by Rory Mitchell
parent 90a5c4db9d
commit b8a0d66fe6
9 changed files with 569 additions and 250 deletions

View File

@@ -1004,14 +1004,29 @@ class AllReducer {
template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
auto previous_num_threads = omp_get_max_threads();
omp_set_num_threads(shards->size());
#pragma omp parallel
{
auto cpu_thread_id = omp_get_thread_num();
f(shards->at(cpu_thread_id));
#pragma omp parallel for schedule(static, 1)
for (int shard = 0; shard < shards->size(); ++shard) {
f(shards->at(shard));
}
}
/**
* \brief Executes some operation on each element of the input vector, using a
* single controlling thread for each element. In addition, passes the shard index
* into the function.
*
* \tparam T Generic type parameter.
* \tparam FunctionT Type of the function t.
* \param shards The shards.
* \param f The func_t to process.
*/
template <typename T, typename FunctionT>
void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
#pragma omp parallel for schedule(static, 1)
for (int shard = 0; shard < shards->size(); ++shard) {
f(shard, shards->at(shard));
}
omp_set_num_threads(previous_num_threads);
}
/**
@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
template <typename ReduceT,typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
auto previous_num_threads = omp_get_max_threads();
omp_set_num_threads(shards->size());
std::vector<ReduceT> sums(shards->size());
#pragma omp parallel
{
auto cpu_thread_id = omp_get_thread_num();
sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
#pragma omp parallel for schedule(static, 1)
for (int shard = 0; shard < shards->size(); ++shard) {
sums[shard] = f(shards->at(shard));
}
omp_set_num_threads(previous_num_threads);
return std::accumulate(sums.begin(), sums.end(), ReduceT());
}
} // namespace dh