Implement GPU accelerated coordinate descent algorithm (#3178)

* Implement GPU accelerated coordinate descent algorithm.
* Exclude external memory tests for GPU
2018-04-20 14:56:35 +12:00
parent ccf80703ef
commit a185ddfe03
12 changed files with 473 additions and 63 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -374,10 +374,10 @@ class DVec {
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (end - begin != Size()) {
      throw std::runtime_error(
-                               "Cannot copy assign vector to DVec, sizes are different");
+          "Cannot copy assign vector to dvec, sizes are different");
    }
-    safe_cuda(cudaMemcpy(this->Data(), begin.get(),
-                         Size() * sizeof(T), cudaMemcpyDefault));
+    safe_cuda(cudaMemcpy(this->Data(), begin.get(), Size() * sizeof(T),
+                         cudaMemcpyDefault));
  }
 };

@@ -544,7 +544,7 @@ struct CubMemory {
  size_t temp_storage_bytes;

  // Thrust
-   using ValueT = char;
+  using value_type = char;  // NOLINT

  CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}

@@ -807,18 +807,20 @@ void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
 * @param nVals number of elements in the input array
 */
 template <typename T>
-T SumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
+typename std::iterator_traits<T>::value_type SumReduction(dh::CubMemory &tmp_mem, T in, int nVals) {
+  using ValueT = typename std::iterator_traits<T>::value_type;
  size_t tmpSize;
  dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
  // Allocate small extra memory for the return value
-  tmp_mem.LazyAllocate(tmpSize + sizeof(T));
-  auto ptr = reinterpret_cast<T *>(tmp_mem.d_temp_storage) + 1;
+  tmp_mem.LazyAllocate(tmpSize + sizeof(ValueT));
+  auto ptr = reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage) + 1;
  dh::safe_cuda(cub::DeviceReduce::Sum(
-    reinterpret_cast<void *>(ptr), tmpSize, in,
-    reinterpret_cast<T *>(tmp_mem.d_temp_storage), nVals));
-  T sum;
-  dh::safe_cuda(cudaMemcpy(&sum, tmp_mem.d_temp_storage, sizeof(T),
-    cudaMemcpyDeviceToHost));
+      reinterpret_cast<void *>(ptr), tmpSize, in,
+      reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage),
+      nVals));
+  ValueT sum;
+  dh::safe_cuda(cudaMemcpy(&sum, tmp_mem.d_temp_storage, sizeof(ValueT),
+                           cudaMemcpyDeviceToHost));
  return sum;
 }

@@ -876,7 +878,8 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
 * \class AllReducer
 *
 * \brief All reducer class that manages its own communication group and
- * streams. Must be initialised before use. If XGBoost is compiled without NCCL this is a dummy class that will error if used with more than one GPU.
+ * streams. Must be initialised before use. If XGBoost is compiled without NCCL
+ * this is a dummy class that will error if used with more than one GPU.
 */

 class AllReducer {
@@ -912,7 +915,8 @@ class AllReducer {
    }
    initialised = true;
 #else
-    CHECK_EQ(device_ordinals.size(), 1) << "XGBoost must be compiled with NCCL to use more than one GPU.";
+    CHECK_EQ(device_ordinals.size(), 1)
+        << "XGBoost must be compiled with NCCL to use more than one GPU.";
 #endif
  }
  ~AllReducer() {
@@ -929,16 +933,13 @@ class AllReducer {
  }

  /**
-   * \fn  void AllReduceSum(int communication_group_idx, const double *sendbuff,
-   * double *recvbuff, int count)
-   *
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
   * streams or comms.
   *
-   * \param           communication_group_idx Zero-based index of the
-   * communication group. \param sendbuff                The sendbuff. \param
-   * sendbuff                The sendbuff. \param [in,out]  recvbuff
-   * The recvbuff. \param           count                   Number of.
+   * \param communication_group_idx Zero-based index of the communication group.
+   * \param sendbuff                The sendbuff.
+   * \param recvbuff                The recvbuff.
+   * \param count                   Number of elements.
   */

  void AllReduceSum(int communication_group_idx, const double *sendbuff,
@@ -954,17 +955,14 @@ class AllReducer {
  }

  /**
-   * \fn  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff, int64_t *recvbuff, int count)
-   *
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
   *
-   * \param           communication_group_idx Zero-based index of the communication group. \param
-   *                                          sendbuff                The sendbuff. \param sendbuff
-   *                                          The sendbuff. \param [in,out]  recvbuff The recvbuff.
-   *                                          \param           count                   Number of.
-   * \param           sendbuff                The sendbuff.
-   * \param [in,out]  recvbuff                If non-null, the recvbuff.
-   * \param           count                   Number of.
+   * Overload for 64-bit integer buffers.
+   *
+   * \param communication_group_idx Zero-based index of the communication group.
+   * \param sendbuff                The sendbuff.
+   * \param recvbuff                The recvbuff.
+   * \param count                   Number of elements.
   */

  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
@@ -993,4 +991,53 @@ class AllReducer {
 #endif
  }
 };
+
+/**
+ * \brief Executes some operation on each element of the input vector,
+ * requesting one OpenMP thread per element.
+ *
+ * \tparam  T          Element type of the shard vector.
+ * \tparam  FunctionT  Callable invoked as f(shard) for each element.
+ * \param shards  Elements to process; an empty vector is a no-op.
+ * \param f       The callable to apply to each element.
+ */
+
+template <typename T, typename FunctionT>
+void ExecuteShards(std::vector<T> *shards, FunctionT f) {
+  const int n_shards = static_cast<int>(shards->size());
+  if (n_shards == 0) return;  // num_threads(0) is not a valid request
+  // Looping over shards (rather than indexing by thread id) still visits every
+  // shard when the runtime grants fewer threads than requested.
+#pragma omp parallel for schedule(static, 1) num_threads(n_shards)
+  for (int i = 0; i < n_shards; ++i) {
+    f(shards->at(i));
+  }
+}
+
+/**
+ * \brief Executes some operation on each element of the input vector, requesting
+ *        one OpenMP thread per element, and returns the sum of the results.
+ *
+ * \tparam  ReduceT    Type of each per-element result and of the returned sum.
+ * \tparam  T          Element type of the shard vector.
+ * \tparam  FunctionT  Callable invoked as f(shard); its result is stored as ReduceT.
+ * \param shards  Elements to process; an empty vector yields ReduceT().
+ * \param f       The callable to apply to each element.
+ *
+ * \return  Sum of f(shard) over all elements, accumulated starting from ReduceT().
+ */
+
+template <typename ReduceT, typename T, typename FunctionT>
+ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
+  const int n_shards = static_cast<int>(shards->size());
+  if (n_shards == 0) return ReduceT();  // num_threads(0) is not a valid request
+  std::vector<ReduceT> sums(n_shards);
+  // Loop over shards instead of indexing by thread id, so no partial result is
+  // left unset when the runtime grants fewer threads than requested.
+#pragma omp parallel for schedule(static, 1) num_threads(n_shards)
+  for (int i = 0; i < n_shards; ++i) {
+    sums[i] = f(shards->at(i));
+  }
+  return std::accumulate(sums.begin(), sums.end(), ReduceT());
+}
 }  // namespace dh