Improve update position function for gpu_hist (#3895)
commit 7af0946ac1
parent 143475b27b
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -766,7 +766,8 @@ typename std::iterator_traits<T>::value_type SumReduction(
     dh::CubMemory &tmp_mem, T in, int nVals) {
   using ValueT = typename std::iterator_traits<T>::value_type;
   size_t tmpSize;
-  dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
+  ValueT *dummy_out = nullptr;
+  dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
   // Allocate small extra memory for the return value
   tmp_mem.LazyAllocate(tmpSize + sizeof(ValueT));
   auto ptr = reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage) + 1;
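CUB device-wide algorithms are invoked twice: a first call with a null temp-storage pointer only reports the scratch size needed, and a second call does the work. The old size query passed the input iterator `in` as a stand-in output, which only works while `T` is a writable pointer; `UpdatePosition` below now calls `SumReduction` with a read-only `cub::TransformInputIterator`, so the query uses a properly typed `ValueT*` dummy output instead. A minimal standalone sketch of the two-phase pattern (illustrative only, not the xgboost helper itself):

    #include <cub/cub.cuh>
    #include <cstdio>

    int main() {
      const int n = 1000;
      float *d_in, *d_out;
      cudaMalloc(&d_in, n * sizeof(float));
      cudaMalloc(&d_out, sizeof(float));
      cudaMemset(d_in, 0, n * sizeof(float));

      // Phase 1: null temp storage; CUB only writes the required size into
      // temp_bytes. The output pointer's type participates in the dispatch.
      void *d_temp = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

      // Phase 2: the same call with real temp storage performs the reduction.
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

      float sum = -1.0f;
      cudaMemcpy(&sum, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      std::printf("sum = %f\n", sum);  // 0.0 for the zero-filled input
      cudaFree(d_temp); cudaFree(d_in); cudaFree(d_out);
      return 0;
    }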
@@ -1074,4 +1075,71 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
   using IndexT = typename xgboost::common::Span<T>::index_type;
   return ToSpan(vec, static_cast<IndexT>(offset), static_cast<IndexT>(size));
 }
+
+template <typename FunctionT>
+class LauncherItr {
+ public:
+  int idx;
+  FunctionT f;
+  XGBOOST_DEVICE LauncherItr() : idx(0) {}
+  XGBOOST_DEVICE LauncherItr(int idx, FunctionT f) : idx(idx), f(f) {}
+  XGBOOST_DEVICE LauncherItr &operator=(int output) {
+    f(idx, output);
+    return *this;
+  }
+};
+
+/**
+ * \brief Thrust compatible iterator type - discards algorithm output and launches device lambda
+ * with the index of the output and the algorithm output as arguments.
+ *
+ * \author  Rory
+ * \date  7/9/2017
+ *
+ * \tparam  FunctionT  Type of the function.
+ */
+template <typename FunctionT>
+class DiscardLambdaItr {
+ public:
+  // Required iterator traits
+  using self_type = DiscardLambdaItr;        // NOLINT
+  using difference_type = ptrdiff_t;         // NOLINT
+  using value_type = void;                   // NOLINT
+  using pointer = value_type *;              // NOLINT
+  using reference = LauncherItr<FunctionT>;  // NOLINT
+  using iterator_category = typename thrust::detail::iterator_facade_category<
+      thrust::any_system_tag, thrust::random_access_traversal_tag, value_type,
+      reference>::type;  // NOLINT
+ private:
+  difference_type offset_;
+  FunctionT f_;
+ public:
+  XGBOOST_DEVICE explicit DiscardLambdaItr(FunctionT f) : offset_(0), f_(f) {}
+  XGBOOST_DEVICE DiscardLambdaItr(difference_type offset, FunctionT f)
+      : offset_(offset), f_(f) {}
+  XGBOOST_DEVICE self_type operator+(const int &b) const {
+    return DiscardLambdaItr(offset_ + b, f_);
+  }
+  XGBOOST_DEVICE self_type operator++() {
+    offset_++;
+    return *this;
+  }
+  XGBOOST_DEVICE self_type operator++(int) {
+    self_type retval = *this;
+    offset_++;
+    return retval;
+  }
+  XGBOOST_DEVICE self_type &operator+=(const int &b) {
+    offset_ += b;
+    return *this;
+  }
+  XGBOOST_DEVICE reference operator*() const {
+    return LauncherItr<FunctionT>(offset_, f_);
+  }
+  XGBOOST_DEVICE reference operator[](int idx) {
+    self_type offset = (*this) + idx;
+    return *offset;
+  }
+};
+
 }  // namespace dh
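The `DiscardLambdaItr` added above lets CUB "write" each scan result into a device lambda instead of memory: dereferencing yields a `LauncherItr` proxy whose `operator=` forwards the element index and the written value to the lambda. The proxy-assignment trick is not CUDA-specific; here is a host-only sketch of the same idea against `std::exclusive_scan` (all names here are illustrative, not from xgboost):

    #include <cstddef>
    #include <cstdio>
    #include <iterator>
    #include <numeric>
    #include <vector>

    template <typename FunctionT>
    struct CallbackProxy {
      std::size_t idx;
      FunctionT* f;
      // "Storing" the scan output invokes the callback instead of writing memory.
      CallbackProxy& operator=(int value) {
        (*f)(idx, value);
        return *this;
      }
    };

    template <typename FunctionT>
    struct CallbackOutputItr {
      using iterator_category = std::output_iterator_tag;
      using value_type = void;
      using difference_type = std::ptrdiff_t;
      using pointer = void;
      using reference = CallbackProxy<FunctionT>;

      std::size_t offset;
      FunctionT* f;
      reference operator*() { return reference{offset, f}; }
      CallbackOutputItr& operator++() { ++offset; return *this; }
      CallbackOutputItr operator++(int) { auto tmp = *this; ++offset; return tmp; }
    };

    int main() {
      std::vector<int> indicator{1, 0, 1, 1, 0};
      auto print = [](std::size_t i, int scanned) {
        std::printf("index %zu -> exclusive sum %d\n", i, scanned);
      };
      // Each output element of the scan is routed straight into the lambda.
      std::exclusive_scan(indicator.begin(), indicator.end(),
                          CallbackOutputItr<decltype(print)>{0, &print}, 0);
      return 0;
    }

The payoff on the device side is that the scan's output never touches global memory; the lambda consumes each partial sum in-flight, as `SortPosition` below does.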
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -380,6 +380,53 @@ struct Segment {
   size_t Size() const { return end - begin; }
 };

+/** \brief Returns a one if the left node index is encountered, otherwise
+ * returns zero. */
+struct IndicateLeftTransform {
+  int left_nidx;
+  explicit IndicateLeftTransform(int left_nidx) : left_nidx(left_nidx) {}
+  __host__ __device__ __forceinline__ int operator()(const int& x) const {
+    return x == left_nidx ? 1 : 0;
+  }
+};
+
+/**
+ * \brief Optimised routine for sorting key value pairs into left and right
+ * segments. Based on a single pass of exclusive scan, uses iterators to
+ * redirect inputs and outputs.
+ */
+void SortPosition(dh::CubMemory* temp_memory, common::Span<int> position,
+                  common::Span<int> position_out, common::Span<bst_uint> ridx,
+                  common::Span<bst_uint> ridx_out, int left_nidx,
+                  int right_nidx, int64_t left_count) {
+  auto d_position_out = position_out.data();
+  auto d_position_in = position.data();
+  auto d_ridx_out = ridx_out.data();
+  auto d_ridx_in = ridx.data();
+  auto write_results = [=] __device__(size_t idx, int ex_scan_result) {
+    int scatter_address;
+    if (d_position_in[idx] == left_nidx) {
+      scatter_address = ex_scan_result;
+    } else {
+      scatter_address = (idx - ex_scan_result) + left_count;
+    }
+    d_position_out[scatter_address] = d_position_in[idx];
+    d_ridx_out[scatter_address] = d_ridx_in[idx];
+  };  // NOLINT
+
+  IndicateLeftTransform conversion_op(left_nidx);
+  cub::TransformInputIterator<int, IndicateLeftTransform, int*> in_itr(
+      d_position_in, conversion_op);
+  dh::DiscardLambdaItr<decltype(write_results)> out_itr(write_results);
+  size_t temp_storage_bytes = 0;
+  cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, in_itr, out_itr,
+                                position.size());
+  temp_memory->LazyAllocate(temp_storage_bytes);
+  cub::DeviceScan::ExclusiveSum(temp_memory->d_temp_storage,
+                                temp_memory->temp_storage_bytes, in_itr,
+                                out_itr, position.size());
+}
+
 struct DeviceShard;

 struct GPUHistBuilderBase {
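How the scatter address in `write_results` works: the exclusive scan of the left-indicator yields, at element `idx`, the number of left-bound rows strictly before it, which is exactly where a left-bound row should land; for a right-bound row, `idx - ex_scan_result` counts the right-bound rows before it, offset past the left partition by `left_count`. For positions {1, 2, 1, 2, 1} with left node 1, the indicator is {1, 0, 1, 0, 1}, its exclusive scan is {0, 1, 1, 2, 2}, and the rows scatter to addresses {0, 3, 1, 4, 2}. A host-only sketch of this single pass, using plain std::vector stand-ins (illustrative only):

    #include <cstdio>
    #include <vector>

    // Sequentially emulates the device-side scan + scatter: ex_scan holds the
    // exclusive prefix sum of the "is left" indicator at each step.
    void SortPositionHost(const std::vector<int>& position,
                          const std::vector<unsigned>& ridx, int left_nidx,
                          size_t left_count, std::vector<int>* position_out,
                          std::vector<unsigned>* ridx_out) {
      size_t ex_scan = 0;  // number of left-bound rows strictly before idx
      for (size_t idx = 0; idx < position.size(); ++idx) {
        // Left rows pack to the front; right rows go after the left partition,
        // ordered by how many right rows precede them (idx - ex_scan).
        size_t scatter_address = (position[idx] == left_nidx)
                                     ? ex_scan
                                     : (idx - ex_scan) + left_count;
        (*position_out)[scatter_address] = position[idx];
        (*ridx_out)[scatter_address] = ridx[idx];
        if (position[idx] == left_nidx) ++ex_scan;
      }
    }

    int main() {
      std::vector<int> position{1, 2, 1, 2, 1};
      std::vector<unsigned> ridx{0, 1, 2, 3, 4};
      std::vector<int> position_out(position.size());
      std::vector<unsigned> ridx_out(ridx.size());
      SortPositionHost(position, ridx, /*left_nidx=*/1, /*left_count=*/3,
                       &position_out, &ridx_out);
      for (size_t i = 0; i < position_out.size(); ++i) {
        std::printf("%d:%u ", position_out[i], ridx_out[i]);
      }
      std::printf("\n");  // prints: 1:0 1:2 1:4 2:1 2:3 (a stable partition)
      return 0;
    }

Because the scan is order-preserving, the partition is stable within each side, which is what replaces the radix sort removed further down.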
@@ -440,26 +487,22 @@ struct DeviceShard {
   TrainParam param;
   bool prediction_cache_initialised;

-  int64_t* tmp_pinned;  // Small amount of staging memory
-
   dh::CubMemory temp_memory;

   std::unique_ptr<GPUHistBuilderBase> hist_builder;

   // TODO(canonizer): do add support multi-batch DMatrix here
-  DeviceShard(int device_id,
-              bst_uint row_begin, bst_uint row_end, TrainParam _param) :
-      device_id_(device_id),
+  DeviceShard(int device_id, bst_uint row_begin, bst_uint row_end,
+              TrainParam _param)
+      : device_id_(device_id),
        row_begin_idx(row_begin),
        row_end_idx(row_end),
        row_stride(0),
        n_rows(row_end - row_begin),
        n_bins(0),
        null_gidx_value(0),
        param(_param),
-       prediction_cache_initialised(false),
-       tmp_pinned(nullptr)
-  {}
+       prediction_cache_initialised(false) {}

   /* Init row_ptrs and row_stride */
   void InitRowPtrs(const SparsePage& row_batch) {
@@ -495,7 +538,6 @@ struct DeviceShard {
   void CreateHistIndices(const SparsePage& row_batch);

   ~DeviceShard() {
-    dh::safe_cuda(cudaFreeHost(tmp_pinned));
   }

   // Reset values for each update iteration
@@ -587,29 +629,18 @@ struct DeviceShard {
            hist.HistogramExists(nidx_parent);
   }

-  /*! \brief Count how many rows are assigned to left node. */
-  __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
-    unsigned ballot = __ballot(val == left_nidx);
-    if (threadIdx.x % 32 == 0) {
-      atomicAdd(reinterpret_cast<unsigned long long*>(d_count),    // NOLINT
-                static_cast<unsigned long long>(__popc(ballot)));  // NOLINT
-    }
-  }
-
   void UpdatePosition(int nidx, int left_nidx, int right_nidx, int fidx,
                       int64_t split_gidx, bool default_dir_left, bool is_dense,
                       int fidx_begin,  // cut.row_ptr[fidx]
                       int fidx_end) {  // cut.row_ptr[fidx + 1]
     dh::safe_cuda(cudaSetDevice(device_id_));
-    auto d_left_count = temp_memory.GetSpan<int64_t>(1);
-    dh::safe_cuda(cudaMemset(d_left_count.data(), 0, sizeof(int64_t)));
     Segment segment = ridx_segments[nidx];
     bst_uint* d_ridx = ridx.Current();
     int* d_position = position.Current();
     common::CompressedIterator<uint32_t> d_gidx = gidx;
     size_t row_stride = this->row_stride;
     // Launch 1 thread for each row
-    dh::LaunchN<1, 512>(
+    dh::LaunchN<1, 128>(
         device_id_, segment.Size(), [=] __device__(bst_uint idx) {
           idx += segment.begin;
           bst_uint ridx = d_ridx[idx];
@@ -634,13 +665,16 @@ struct DeviceShard {
             position = default_dir_left ? left_nidx : right_nidx;
           }

-          CountLeft(d_left_count.data(), position, left_nidx);
           d_position[idx] = position;
         });
-
-    dh::safe_cuda(cudaMemcpy(tmp_pinned, d_left_count.data(), sizeof(int64_t),
-                             cudaMemcpyDeviceToHost));
-    auto left_count = *tmp_pinned;
-    SortPosition(segment, left_nidx, right_nidx);
+    IndicateLeftTransform conversion_op(left_nidx);
+    cub::TransformInputIterator<int, IndicateLeftTransform, int*> left_itr(
+        d_position + segment.begin, conversion_op);
+    int left_count = dh::SumReduction(temp_memory, left_itr, segment.Size());
+    CHECK_LE(left_count, segment.Size());
+    CHECK_GE(left_count, 0);
+
+    SortPositionAndCopy(segment, left_nidx, right_nidx, left_count);
+
     ridx_segments[left_nidx] =
         Segment(segment.begin, segment.begin + left_count);
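In the hunk above, `left_count` is now produced by a single `cub::DeviceReduce::Sum` over a `cub::TransformInputIterator` that maps each position to 1 (left) or 0 (right), replacing the old warp-ballot `CountLeft` kernel and its pinned-memory round trip. A standalone sketch of that counting idiom (variable names here are illustrative):

    #include <cub/cub.cuh>
    #include <cstdio>

    // Functor mirroring IndicateLeftTransform: 1 for left-bound rows, else 0.
    struct IsLeft {
      int left_nidx;
      __host__ __device__ int operator()(const int& x) const {
        return x == left_nidx ? 1 : 0;
      }
    };

    int main() {
      const int n = 5;
      int h_position[n] = {1, 2, 1, 2, 1};
      int *d_position, *d_count;
      cudaMalloc(&d_position, n * sizeof(int));
      cudaMalloc(&d_count, sizeof(int));
      cudaMemcpy(d_position, h_position, n * sizeof(int),
                 cudaMemcpyHostToDevice);

      // Reads through the iterator yield 1 for left rows and 0 otherwise, so
      // a plain sum reduction over it produces the left count in one pass.
      cub::TransformInputIterator<int, IsLeft, int*> itr(d_position, IsLeft{1});
      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_count, n);
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_count, n);

      int left_count = 0;
      cudaMemcpy(&left_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
      std::printf("left_count = %d\n", left_count);  // 3
      cudaFree(d_temp); cudaFree(d_position); cudaFree(d_count);
      return 0;
    }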
@@ -649,25 +683,15 @@ struct DeviceShard {
   }

   /*! \brief Sort row indices according to position. */
-  void SortPosition(const Segment& segment, int left_nidx, int right_nidx) {
-    int min_bits = 0;
-    int max_bits = static_cast<int>(
-        std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1)));
-
-    size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs(
-        nullptr, temp_storage_bytes,
-        position.Current() + segment.begin, position.other() + segment.begin,
-        ridx.Current() + segment.begin, ridx.other() + segment.begin,
-        segment.Size(), min_bits, max_bits);
-
-    temp_memory.LazyAllocate(temp_storage_bytes);
-
-    cub::DeviceRadixSort::SortPairs(
-        temp_memory.d_temp_storage, temp_memory.temp_storage_bytes,
-        position.Current() + segment.begin, position.other() + segment.begin,
-        ridx.Current() + segment.begin, ridx.other() + segment.begin,
-        segment.Size(), min_bits, max_bits);
+  void SortPositionAndCopy(const Segment& segment, int left_nidx, int right_nidx,
+                           size_t left_count) {
+    SortPosition(
+        &temp_memory,
+        common::Span<int>(position.Current() + segment.begin, segment.Size()),
+        common::Span<int>(position.other() + segment.begin, segment.Size()),
+        common::Span<bst_uint>(ridx.Current() + segment.begin, segment.Size()),
+        common::Span<bst_uint>(ridx.other() + segment.begin, segment.Size()),
+        left_nidx, right_nidx, left_count);
     // Copy back key
     dh::safe_cuda(cudaMemcpy(
         position.Current() + segment.begin, position.other() + segment.begin,
@@ -823,8 +847,6 @@ inline void DeviceShard::InitCompressedData(

   // Init histogram
   hist.Init(device_id_, hmat.row_ptr.back());
-
-  dh::safe_cuda(cudaMallocHost(&tmp_pinned, sizeof(int64_t)));
 }

 inline void DeviceShard::CreateHistIndices(const SparsePage& row_batch) {
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -327,8 +327,6 @@ TEST(GpuHist, ApplySplit) {
   shard->row_stride = n_cols;
   thrust::sequence(shard->ridx.CurrentDVec().tbegin(),
                    shard->ridx.CurrentDVec().tend());
-  // Free inside DeviceShard
-  dh::safe_cuda(cudaMallocHost(&(shard->tmp_pinned), sizeof(int64_t)));
   // Initialize GPUHistMaker
   hist_maker.param_ = param;
   RegTree tree;
@@ -389,5 +387,44 @@ TEST(GpuHist, ApplySplit) {
   ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16);
 }

+void TestSortPosition(const std::vector<int>& position_in, int left_idx,
+                      int right_idx) {
+  int left_count = std::count(position_in.begin(), position_in.end(), left_idx);
+  thrust::device_vector<int> position = position_in;
+  thrust::device_vector<int> position_out(position.size());
+
+  thrust::device_vector<bst_uint> ridx(position.size());
+  thrust::sequence(ridx.begin(), ridx.end());
+  thrust::device_vector<bst_uint> ridx_out(ridx.size());
+  dh::CubMemory tmp;
+  SortPosition(
+      &tmp, common::Span<int>(position.data().get(), position.size()),
+      common::Span<int>(position_out.data().get(), position_out.size()),
+      common::Span<bst_uint>(ridx.data().get(), ridx.size()),
+      common::Span<bst_uint>(ridx_out.data().get(), ridx_out.size()), left_idx,
+      right_idx, left_count);
+  thrust::host_vector<int> position_result = position_out;
+  thrust::host_vector<int> ridx_result = ridx_out;
+
+  // Check position is sorted
+  EXPECT_TRUE(std::is_sorted(position_result.begin(), position_result.end()));
+  // Check row indices are sorted inside left and right segment
+  EXPECT_TRUE(
+      std::is_sorted(ridx_result.begin(), ridx_result.begin() + left_count));
+  EXPECT_TRUE(
+      std::is_sorted(ridx_result.begin() + left_count, ridx_result.end()));
+
+  // Check key value pairs are the same
+  for (auto i = 0ull; i < ridx_result.size(); i++) {
+    EXPECT_EQ(position_result[i], position_in[ridx_result[i]]);
+  }
+}
+
+TEST(GpuHist, SortPosition) {
+  TestSortPosition({1, 2, 1, 2, 1}, 1, 2);
+  TestSortPosition({1, 1, 1, 1}, 1, 2);
+  TestSortPosition({2, 2, 2, 2}, 1, 2);
+  TestSortPosition({1, 2, 1, 2, 3}, 1, 2);
+}
 }  // namespace tree
 }  // namespace xgboost