Improved gpu_hist_experimental algorithm (#2866)

- Implement colsampling, subsampling for gpu_hist_experimental - Optimised multi-GPU implementation for gpu_hist_experimental - Make nccl optional - Add Volta architecture flag - Optimise RegLossObj - Add timing utilities for debug verbose mode - Bump required cuda version to 8.0
2017-11-11 13:58:40 +13:00
parent 16c63f30d0
commit 40c6e2f0c8
14 changed files with 855 additions and 473 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -15,7 +15,10 @@
 #include <sstream>
 #include <string>
 #include <vector>
+
+#ifdef XGBOOST_USE_NCCL
 #include "nccl.h"
+#endif

 // Uncomment to enable
 #define TIMERS
@@ -44,6 +47,7 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
  return code;
 }

+#ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)

 inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
@@ -57,6 +61,7 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,

  return code;
 }
+#endif

 template <typename T>
 T *raw(thrust::device_vector<T> &v) {  //  NOLINT
@@ -137,6 +142,19 @@ inline int get_device_idx(int gpu_id) {
  return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
 }

+inline void check_compute_capability() {
+  int n_devices = n_visible_devices();
+  for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
+    cudaDeviceProp prop;
+    safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
+    std::ostringstream oss;
+    oss << "CUDA Capability Major/Minor version number: " << prop.major << "."
+        << prop.minor << " is insufficient.  Need >=3.5";
+    int failed = prop.major < 3 || prop.major == 3 && prop.minor < 5;
+    if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
+  }
+}
+
 /*
 * Range iterator
 */
@@ -241,37 +259,12 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
  }

  safe_cuda(cudaSetDevice(device_idx));
-  const int GRID_SIZE = static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
+  const int GRID_SIZE =
+      static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
                                                lambda);
 }

-/*
- *  Timers
- */
-
-struct Timer {
-  typedef std::chrono::high_resolution_clock ClockT;
-  typedef std::chrono::high_resolution_clock::time_point TimePointT;
-  typedef std::chrono::high_resolution_clock::duration DurationT;
-  typedef std::chrono::duration<double> SecondsT;
-
-  TimePointT start;
-  DurationT elapsed;
-  Timer() { Reset(); }
-  void Reset() {
-    elapsed = DurationT::zero();
-    Start();
-  }
-  void Start() { start = ClockT::now(); }
-  void Stop() { elapsed += ClockT::now() - start; }
-  double ElapsedSeconds() const { return SecondsT(elapsed).count(); }
-  void PrintElapsed(std::string label) {
-    printf("%s:\t %fs\n", label.c_str(), SecondsT(elapsed).count());
-    Reset();
-  }
-};
-
 /*
 * Memory
 */
@@ -444,7 +437,7 @@ class bulk_allocator {

  template <typename T>
  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
-                            size_t first_size) {
+                     size_t first_size) {
    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
                                 first_size);
  }
@@ -470,8 +463,7 @@ class bulk_allocator {

  template <typename T, typename... Args>
  size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
-    return get_size_bytes<T>(first_vec, first_size) +
-           get_size_bytes(args...);
+    return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
  }

  template <typename T>
@@ -497,6 +489,7 @@ class bulk_allocator {
      if (!(d_ptr[i] == nullptr)) {
        safe_cuda(cudaSetDevice(_device_idx[i]));
        safe_cuda(cudaFree(d_ptr[i]));
+        d_ptr[i] = nullptr;
      }
    }
  }
@@ -642,7 +635,8 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,

  for (auto item : dh::block_stride_range(int(0), int(tile_num_rows + 1))) {
    temp_storage.tile_segment_end_offsets[item] =
-        segment_end_offsets[min(tile_start_coord.x + item, num_segments - 1)];
+        segment_end_offsets[min(static_cast<size_t>(tile_start_coord.x + item),
+                                static_cast<size_t>(num_segments - 1))];
  }
  __syncthreads();

@@ -693,8 +687,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                      BLOCK_THREADS, segments, num_segments, count);

  LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
-      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
-                                     num_segments);
+      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
+                                               segments + 1, f, num_segments);
 }

 template <typename func_t, typename offset_t>
@@ -836,4 +830,125 @@ void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
                                       });
 }

+/**
+ * \class AllReducer
+ *
+ * \brief All reducer class that manages its own communication group and
+ * streams. Must be initialised before use. If XGBoost is compiled without NCCL this is a dummy class that will error if used with more than one GPU.
+ */
+
+class AllReducer {
+  bool initialised;
+#ifdef XGBOOST_USE_NCCL
+  std::vector<ncclComm_t> comms;
+  std::vector<cudaStream_t> streams;
+  std::vector<int> device_ordinals;
+#endif
+ public:
+  AllReducer() : initialised(false) {}
+
+  /**
+   * \fn  void Init(const std::vector<int> &device_ordinals)
+   *
+   * \brief Initialise with the desired device ordinals for this communication
+   * group.
+   *
+   * \param device_ordinals The device ordinals.
+   */
+
+  void Init(const std::vector<int> &device_ordinals) {
+#ifdef XGBOOST_USE_NCCL
+    this->device_ordinals = device_ordinals;
+    comms.resize(device_ordinals.size());
+    dh::safe_nccl(ncclCommInitAll(comms.data(),
+                                  static_cast<int>(device_ordinals.size()),
+                                  device_ordinals.data()));
+    streams.resize(device_ordinals.size());
+    for (size_t i = 0; i < device_ordinals.size(); i++) {
+      safe_cuda(cudaSetDevice(device_ordinals[i]));
+      safe_cuda(cudaStreamCreate(&streams[i]));
+    }
+    initialised = true;
+#else
+    CHECK_EQ(device_ordinals.size(), 1) << "XGBoost must be compiled with NCCL to use more than one GPU.";
+#endif
+  }
+  ~AllReducer() {
+#ifdef XGBOOST_USE_NCCL
+    if (initialised) {
+      for (auto &stream : streams) {
+        dh::safe_cuda(cudaStreamDestroy(stream));
+      }
+      for (auto &comm : comms) {
+        ncclCommDestroy(comm);
+      }
+    }
+#endif
+  }
+
+  /**
+   * \fn  void AllReduceSum(int communication_group_idx, const double *sendbuff,
+   * double *recvbuff, int count)
+   *
+   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
+   * streams or comms.
+   *
+   * \param           communication_group_idx Zero-based index of the
+   * communication group. \param sendbuff                The sendbuff. \param
+   * sendbuff                The sendbuff. \param [in,out]  recvbuff
+   * The recvbuff. \param           count                   Number of.
+   */
+
+  void AllReduceSum(int communication_group_idx, const double *sendbuff,
+                    double *recvbuff, int count) {
+#ifdef XGBOOST_USE_NCCL
+    CHECK(initialised);
+
+    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
+                                comms[communication_group_idx],
+                                streams[communication_group_idx]));
+#endif
+  }
+
+  /**
+   * \fn  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff, int64_t *recvbuff, int count)
+   *
+   * \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
+   *
+   * \param           communication_group_idx Zero-based index of the communication group. \param
+   *                                          sendbuff                The sendbuff. \param sendbuff
+   *                                          The sendbuff. \param [in,out]  recvbuff The recvbuff.
+   *                                          \param           count                   Number of.
+   * \param           sendbuff                The sendbuff.
+   * \param [in,out]  recvbuff                If non-null, the recvbuff.
+   * \param           count                   Number of.
+   */
+
+  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
+                    int64_t *recvbuff, int count) {
+#ifdef XGBOOST_USE_NCCL
+    CHECK(initialised);
+
+    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum,
+                                comms[communication_group_idx],
+                                streams[communication_group_idx]));
+#endif
+  }
+
+  /**
+   * \fn  void Synchronize()
+   *
+   * \brief Synchronizes the entire communication group.
+   */
+  void Synchronize() {
+#ifdef XGBOOST_USE_NCCL
+    for (int i = 0; i < device_ordinals.size(); i++) {
+      dh::safe_cuda(cudaSetDevice(device_ordinals[i]));
+      dh::safe_cuda(cudaStreamSynchronize(streams[i]));
+    }
+#endif
+  }
+};
 }  // namespace dh
--- a/src/common/timer.h
+++ b/src/common/timer.h
@@ -0,0 +1,77 @@
+/*!
+ * Copyright by Contributors 2017
+ */
+#pragma once
+#include <chrono>
+#include <iostream>
+#include <map>
+#include <string>
+
+namespace xgboost {
+namespace common {
+struct Timer {
+  typedef std::chrono::high_resolution_clock ClockT;
+  typedef std::chrono::high_resolution_clock::time_point TimePointT;
+  typedef std::chrono::high_resolution_clock::duration DurationT;
+  typedef std::chrono::duration<double> SecondsT;
+
+  TimePointT start;
+  DurationT elapsed;
+  Timer() { Reset(); }
+  void Reset() {
+    elapsed = DurationT::zero();
+    Start();
+  }
+  void Start() { start = ClockT::now(); }
+  void Stop() { elapsed += ClockT::now() - start; }
+  double ElapsedSeconds() const { return SecondsT(elapsed).count(); }
+  void PrintElapsed(std::string label) {
+    printf("%s:\t %fs\n", label.c_str(), SecondsT(elapsed).count());
+    Reset();
+  }
+};
+
+/**
+ * \struct  Monitor
+ *
+ * \brief Timing utility used to measure total method execution time over the
+ * lifetime of the containing object.
+ */
+
+struct Monitor {
+  bool debug_verbose = false;
+  std::string label = "";
+  std::map<std::string, Timer> timer_map;
+  Timer self_timer;
+
+  Monitor() { self_timer.Start(); }
+
+  ~Monitor() {
+    if (!debug_verbose) return;
+
+    std::cout << "========\n";
+    std::cout << "Monitor: " << label << "\n";
+    std::cout << "========\n";
+    for (auto &kv : timer_map) {
+      kv.second.PrintElapsed(kv.first);
+    }
+    self_timer.Stop();
+    self_timer.PrintElapsed(label + " Lifetime");
+  }
+  void Init(std::string label, bool debug_verbose) {
+    this->debug_verbose = debug_verbose;
+    this->label = label;
+  }
+  void Start(const std::string &name) { timer_map[name].Start(); }
+  void Stop(const std::string &name) {
+    if (debug_verbose) {
+#ifdef __CUDACC__
+#include "device_helpers.cuh"
+      dh::synchronize_all();
+#endif
+    }
+    timer_map[name].Stop();
+  }
+};
+}  // namespace common
+}  // namespace xgboost