[GPU-Plugin] Major refactor (#2644)

* Removal of redundant code/files.
* Removal of exact namespace in GPU plugin.
* Revert double precision histograms to single precision for performance on Maxwell/Kepler.
2017-08-30 10:53:52 +12:00
parent 39adba51c5
commit 19a53814ce
26 changed files with 2170 additions and 5637 deletions
--- a/plugin/updater_gpu/src/device_helpers.cuh
+++ b/plugin/updater_gpu/src/device_helpers.cuh
@@ -2,13 +2,11 @@
 * Copyright 2017 XGBoost contributors
 */
 #pragma once
-#include <xgboost/logging.h>
-#include <thrust/binary_search.h>
 #include <thrust/device_vector.h>
-#include <thrust/random.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system/cuda/execution_policy.h>
 #include <thrust/system_error.h>
+#include <xgboost/logging.h>
 #include <algorithm>
 #include <chrono>
 #include <ctime>
@@ -20,7 +18,6 @@
 #include "nccl.h"

 // Uncomment to enable
-// #define DEVICE_TIMER
 #define TIMERS

 namespace dh {
@@ -61,25 +58,6 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
  return code;
 }

-#define gpuErrchk(ans) \
-  { gpuAssert((ans), __FILE__, __LINE__); }
-
-inline void gpuAssert(cudaError_t code, const char *file, int line,
-                      bool abort = true) {
-  if (code != cudaSuccess) {
-    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
-            line);
-    if (abort){
-      std::stringstream ss;
-      ss << file << "(" << line << ")";
-      std::string file_and_line;
-      ss >> file_and_line;
-      throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
-    }
-  }
-}
-
-
 inline int n_visible_devices() {
  int n_visgpus = 0;

@@ -237,7 +215,8 @@ __device__ range block_stride_range(T begin, T end) {
  return r;
 }

-// Threadblock iterates over range, filling with value. Requires all threads in block to be active.
+// Threadblock iterates over range, filling with value. Requires all threads in
+// block to be active.
 template <typename IterT, typename ValueT>
 __device__ void block_fill(IterT begin, size_t n, ValueT value) {
  for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
@@ -485,7 +464,7 @@ class bulk_allocator {
  }

  template <typename... Args>
-  void allocate(int device_idx, bool silent ,Args... args) {
+  void allocate(int device_idx, bool silent, Args... args) {
    size_t size = get_size_bytes(args...);

    char *ptr = allocate_device(device_idx, size, MemoryT);
@@ -496,8 +475,7 @@ class bulk_allocator {
    _size.push_back(size);
    _device_idx.push_back(device_idx);

-    if(!silent)
-    {
+    if (!silent) {
      const int mb_size = 1048576;
      LOG(CONSOLE) << "Allocated " << size / mb_size << "MB on [" << device_idx
                   << "] " << device_name(device_idx) << ", "
@@ -545,7 +523,6 @@ struct CubMemory {
  bool IsAllocated() { return d_temp_storage != NULL; }
 };

-
 /*
 *  Utility functions
 */
@@ -653,24 +630,6 @@ inline void multi_launch_n(size_t n, int n_devices, L lambda) {
 #endif
 }

-/*
- * Random
- */
-
-struct BernoulliRng {
-  float p;
-  int seed;
-
-  __host__ __device__ BernoulliRng(float p, int seed) : p(p), seed(seed) {}
-
-  __host__ __device__ bool operator()(const int i) const {
-    thrust::default_random_engine rng(seed);
-    thrust::uniform_real_distribution<float> dist;
-    rng.discard(i);
-    return dist(rng) <= p;
-  }
-};
-
 /**
 * @brief Helper macro to measure timing on GPU
 * @param call the GPU call
@@ -687,9 +646,9 @@ struct BernoulliRng {
 // Load balancing search

 template <typename coordinate_t, typename segments_t, typename offset_t>
-void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates, int num_tiles,
-                         int tile_size, segments_t segments, offset_t num_rows,
-                         offset_t num_elements) {
+void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
+                         int num_tiles, int tile_size, segments_t segments,
+                         offset_t num_rows, offset_t num_elements) {
  dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) {
    offset_t diagonal = idx * tile_size;
    coordinate_t tile_coordinate;
@@ -761,8 +720,9 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
 }

 template <typename func_t, typename segments_iter, typename offset_t>
-void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
-                  segments_iter segments, offset_t num_segments, func_t f) {
+void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
+                        offset_t count, segments_iter segments,
+                        offset_t num_segments, func_t f) {
  typedef typename cub::CubVector<offset_t, 2>::Type coordinate_t;
  dh::safe_cuda(cudaSetDevice(device_idx));
  const int BLOCK_THREADS = 256;
@@ -774,8 +734,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t cou
  coordinate_t *tmp_tile_coordinates =
      reinterpret_cast<coordinate_t *>(temp_memory->d_temp_storage);

-  FindMergePartitions(device_idx, tmp_tile_coordinates, num_tiles, BLOCK_THREADS, segments,
-                      num_segments, count);
+  FindMergePartitions(device_idx, tmp_tile_coordinates, num_tiles,
+                      BLOCK_THREADS, segments, num_segments, count);

  LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
      <<<num_tiles, BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
@@ -783,22 +743,24 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t cou
 }

 template <typename func_t, typename offset_t>
-void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments, func_t f) {
+void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
+                       func_t f) {
  CHECK(count % num_segments == 0) << "Data is not dense.";

-  launch_n(device_idx, count, [=]__device__(offset_t idx)
-  {
+  launch_n(device_idx, count, [=] __device__(offset_t idx) {
    offset_t segment = idx / (count / num_segments);
    f(idx, segment);
  });
 }

 /**
- * \fn  template <typename func_t, typename segments_iter, typename offset_t> void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count, segments_iter segments, offset_t num_segments, bool is_dense, func_t f)
+ * \fn  template <typename func_t, typename segments_iter, typename offset_t>
+ * void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
+ * segments_iter segments, offset_t num_segments, bool is_dense, func_t f)
 *
- * \brief Load balancing search function. Reads a CSR type matrix description and allows a function
- *        to be executed on each element. Search 'modern GPU load balancing search' for more
- *        information.
+ * \brief Load balancing search function. Reads a CSR type matrix description
+ * and allows a function to be executed on each element. Search 'modern GPU load
+ * balancing search' for more information.
 *
 * \author  Rory
 * \date  7/9/2017
@@ -817,12 +779,106 @@ void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments, fu

 template <typename func_t, typename segments_iter, typename offset_t>
 void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
-  segments_iter segments, offset_t num_segments, bool is_dense, func_t f) {
+                  segments_iter segments, offset_t num_segments, bool is_dense,
+                  func_t f) {
  if (is_dense) {
    DenseTransformLbs(device_idx, count, num_segments, f);
-  }
-  else {
-    SparseTransformLbs(device_idx, temp_memory, count, segments, num_segments, f);
+  } else {
+    SparseTransformLbs(device_idx, temp_memory, count, segments, num_segments,
+                       f);
  }
 }
+
+/**
+ * @brief Helper function to sort the pairs using cub's segmented RadixSortPairs
+ * @param tmp_mem cub temporary memory info
+ * @param keys keys double-buffer array
+ * @param vals the values double-buffer array
+ * @param nVals number of elements in the array
+ * @param nSegs number of segments
+ * @param offsets segment boundaries; segment i spans [offsets[i], offsets[i+1])
+ */
+template <typename T1, typename T2>
+void segmentedSort(dh::CubMemory *tmp_mem, dh::dvec2<T1> *keys,
+                   dh::dvec2<T2> *vals, int nVals, int nSegs,
+                   const dh::dvec<int> &offsets, int start = 0,
+                   int end = sizeof(T1) * 8) {
+  size_t tmpSize;
+  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
+      NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.data(),
+      offsets.data() + 1, start, end));
+  tmp_mem->LazyAllocate(tmpSize);
+  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
+      tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
+      nSegs, offsets.data(), offsets.data() + 1, start, end));
+}
+
+/**
+ * @brief Helper function to perform device-wide sum-reduction
+ * @param tmp_mem cub temporary memory info
+ * @param in the input array to be reduced
+ * @param out the output reduced value
+ * @param nVals number of elements in the input array
+ */
+template <typename T>
+void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
+                  int nVals) {
+  size_t tmpSize;
+  dh::safe_cuda(
+      cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
+  tmp_mem.LazyAllocate(tmpSize);
+  dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
+                                       in.data(), out.data(), nVals));
+}
+
+/**
+ * @brief Fill a given constant value across all elements in the buffer
+ * @param out the buffer to be filled
+ * @param len number of elements in the buffer
+ * @param def default value to be filled
+ */
+template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
+void fillConst(int device_idx, T *out, int len, T def) {
+  dh::launch_n<ItemsPerThread, BlkDim>(device_idx, len,
+                                       [=] __device__(int i) { out[i] = def; });
+}
+
+/**
+ * @brief gather elements
+ * @param out1 output gathered array for the first buffer
+ * @param in1 first input buffer
+ * @param out2 output gathered array for the second buffer
+ * @param in2 second input buffer
+ * @param instId gather indices
+ * @param nVals length of the buffers
+ */
+template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
+void gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
+            const int *instId, int nVals) {
+  dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
+                                       [=] __device__(int i) {
+                                         int iid = instId[i];
+                                         T1 v1 = in1[iid];
+                                         T2 v2 = in2[iid];
+                                         out1[i] = v1;
+                                         out2[i] = v2;
+                                       });
+}
+
+/**
+ * @brief gather elements
+ * @param out output gathered array
+ * @param in input buffer
+ * @param instId gather indices
+ * @param nVals length of the buffers
+ */
+template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
+void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
+  dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
+                                       [=] __device__(int i) {
+                                         int iid = instId[i];
+                                         out[i] = in[iid];
+                                       });
+}
+
 }  // namespace dh