Optimizations for quantisation on device (#4572)

* Do not create device vectors for the entire sparse page while computing histograms.
  - While creating the compressed histogram indices, the row-pointer vector was created for the
    entire sparse page batch. This is needless, since we only process chunks at a time, sized by a
    slice of the total GPU memory.
  - This PR allocates only as much as is required to store the appropriate row pointers and
    entries for the chunk being processed (a sketch of this pattern follows the commit metadata
    below).
* Do not dereference row_ptrs once the device_vector has been created, to elide host copies of
  those counts.
  - Instead, grab the entry counts directly from the SparsePage offsets.
parent ba1d848767
commit 6757654337
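For illustration, here is a minimal host-side sketch (not part of the diff) of the per-chunk copy pattern described above. The names offset_vec, data_vec, batch_row_begin, and batch_row_end mirror the diff below, but CopyChunkToDevice itself and the local Entry struct are hypothetical stand-ins, not XGBoost code:

// Hypothetical sketch of the per-chunk allocation strategy (not XGBoost code).
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <cstddef>
#include <vector>

struct Entry { int index; float fvalue; };  // stand-in for xgboost::Entry

void CopyChunkToDevice(const std::vector<size_t>& offset_vec,  // CSR row offsets (host)
                       const std::vector<Entry>& data_vec,     // CSR entries (host)
                       size_t batch_row_begin, size_t batch_row_end) {
  size_t batch_nrows = batch_row_end - batch_row_begin;

  // Row pointers for this chunk only: batch_nrows + 1 values,
  // not one per row of the whole sparse page batch.
  thrust::device_vector<size_t> row_ptrs(batch_nrows + 1);
  thrust::copy(offset_vec.begin() + batch_row_begin,
               offset_vec.begin() + batch_row_end + 1,
               row_ptrs.begin());

  // Entry counts come straight from the host offsets; no device reads needed.
  size_t ent_cnt_begin = offset_vec[batch_row_begin];
  size_t ent_cnt_end   = offset_vec[batch_row_end];
  size_t n_entries = ent_cnt_end - ent_cnt_begin;

  // Copy only the entries that belong to this chunk.
  thrust::device_vector<Entry> entries_d(n_entries);
  thrust::copy(data_vec.begin() + ent_cnt_begin,
               data_vec.begin() + ent_cnt_end,
               entries_d.begin());

  // ... launch the compression kernel on (row_ptrs, entries_d) ...
}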
@@ -483,8 +483,6 @@ __global__ void CompressBinEllpackKernel(
     const uint32_t* __restrict__ cut_rows,  // HistCutMatrix::row_ptrs
     size_t base_row,                        // batch_row_begin
     size_t n_rows,
-    // row_ptr_begin: row_offset[base_row], the start position of base_row
-    size_t row_ptr_begin,
     size_t row_stride,
     unsigned int null_gidx_value) {
   size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
@@ -495,7 +493,7 @@ __global__ void CompressBinEllpackKernel(
   int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
   unsigned int bin = null_gidx_value;
   if (ifeature < row_length) {
-    Entry entry = entries[row_ptrs[irow] - row_ptr_begin + ifeature];
+    Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
     int feature = entry.index;
     float fvalue = entry.fvalue;
     // {feature_cuts, ncuts} forms the array of cuts of `feature'.
@@ -697,8 +695,6 @@ struct DeviceShard {
   /*! \brief Sum gradient for each node. */
   std::vector<GradientPair> node_sum_gradients;
   common::Span<GradientPair> node_sum_gradients_d;
-  /*! \brief On-device feature set, only actually used on one of the devices */
-  dh::device_vector<int> feature_set_d;
   dh::device_vector<int64_t>
       left_counts;  // Useful to keep a bunch of zeroed memory for sort position
   /*! The row offset for this shard. */
@@ -1311,14 +1307,6 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
   size_t row_stride = this->ellpack_matrix.row_stride;

   const auto &offset_vec = row_batch.offset.ConstHostVector();
-  /*! \brief row offset in SparsePage (the input data). */
-  CHECK_LE(device_row_state.rows_to_process_from_batch, offset_vec.size());
-  dh::device_vector<size_t> row_ptrs(device_row_state.rows_to_process_from_batch+1);
-  thrust::copy(
-      offset_vec.data() + device_row_state.row_offset_in_current_batch,
-      offset_vec.data() + device_row_state.row_offset_in_current_batch +
-          device_row_state.rows_to_process_from_batch + 1,
-      row_ptrs.begin());

   int num_symbols = n_bins + 1;
   // bin and compress entries in batches of rows
@@ -1327,7 +1315,6 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
       static_cast<size_t>(device_row_state.rows_to_process_from_batch));
   const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();

-  dh::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
   size_t gpu_nbatches = dh::DivRoundUp(device_row_state.rows_to_process_from_batch,
                                        gpu_batch_nrows);

@@ -1339,35 +1326,41 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
     }
     size_t batch_nrows = batch_row_end - batch_row_begin;

+    const auto ent_cnt_begin =
+        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
+    const auto ent_cnt_end =
+        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+
+    /*! \brief row offset in SparsePage (the input data). */
+    dh::device_vector<size_t> row_ptrs(batch_nrows+1);
+    thrust::copy(
+        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
+        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
+        row_ptrs.begin());
+
     // number of entries in this batch.
-    size_t n_entries = row_ptrs[batch_row_end] - row_ptrs[batch_row_begin];
+    size_t n_entries = ent_cnt_end - ent_cnt_begin;
+    dh::device_vector<Entry> entries_d(n_entries);
     // copy data entries to device.
     dh::safe_cuda
         (cudaMemcpy
-         (entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin],
+         (entries_d.data().get(), data_vec.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
     const dim3 block3(32, 8, 1);  // 256 threads
-    const dim3 grid3(dh::DivRoundUp(device_row_state.rows_to_process_from_batch, block3.x),
+    const dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
                      dh::DivRoundUp(row_stride, block3.y), 1);
     CompressBinEllpackKernel<<<grid3, block3>>>
         (common::CompressedBufferWriter(num_symbols),
          gidx_buffer.data(),
-         row_ptrs.data().get() + batch_row_begin,
+         row_ptrs.data().get(),
          entries_d.data().get(),
          gidx_fvalue_map.data(),
          feature_segments.data(),
          device_row_state.total_rows_processed + batch_row_begin,
          batch_nrows,
-         row_ptrs[batch_row_begin],
          row_stride,
          null_gidx_value);
   }
-
-  // free the memory that is no longer needed
-  row_ptrs.resize(0);
-  row_ptrs.shrink_to_fit();
-  entries_d.resize(0);
-  entries_d.shrink_to_fit();
 }

 // An instance of this type is created which keeps track of total number of rows to process,
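A side note on the second optimization (context only, not part of the diff): indexing a thrust::device_vector from host code goes through a device_reference, and converting that reference to a value issues a blocking single-element device-to-host copy. Reading the counts from the host-resident SparsePage offsets avoids those copies. A hedged sketch of the two access patterns, using hypothetical helper names:

#include <thrust/device_vector.h>
#include <cstddef>
#include <vector>

// Before: each host-side operator[] on a thrust::device_vector triggers its
// own implicit cudaMemcpy of a single element back to the host.
size_t CountFromDevice(const thrust::device_vector<size_t>& row_ptrs,
                       size_t begin, size_t end) {
  size_t e = row_ptrs[end];    // implicit device-to-host copy
  size_t b = row_ptrs[begin];  // another implicit device-to-host copy
  return e - b;
}

// After: the offsets are already resident on the host inside the SparsePage,
// so the same count is obtained with plain host memory reads.
size_t CountFromHost(const std::vector<size_t>& offset_vec,
                     size_t begin, size_t end) {
  return offset_vec[end] - offset_vec[begin];
}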
@@ -477,7 +477,7 @@ TEST(GpuHist, SortPosition) {
   TestSortPosition({1, 2, 1, 2, 3}, 1, 2);
 }

-TEST(GpuHist, TestHistogramIndex) {
+void TestHistogramIndexImpl(int n_gpus) {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory

@@ -491,23 +491,26 @@ TEST(GpuHist, TestHistogramIndex) {
       CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));

   std::vector<std::pair<std::string, std::string>> training_params = {
-    {"max_depth", "1"},
+    {"max_depth", "10"},
     {"max_leaves", "0"}
   };

-  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, 1));
+  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, n_gpus));
   hist_maker.Init(training_params, &learner_param);
   hist_maker.InitDataOnce(hist_maker_dmat.get());
   hist_maker_ext.Init(training_params, &learner_param);
   hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());

+  ASSERT_EQ(hist_maker.shards_.size(), hist_maker_ext.shards_.size());
+
   // Extract the device shards from the histogram makers and from that its compressed
   // histogram index
-  const auto &dev_shard = hist_maker.shards_[0];
+  for (size_t i = 0; i < hist_maker.shards_.size(); ++i) {
+    const auto &dev_shard = hist_maker.shards_[i];
   std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
   dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);

-  const auto &dev_shard_ext = hist_maker_ext.shards_[0];
+    const auto &dev_shard_ext = hist_maker_ext.shards_[i];
   std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
   dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);

@@ -516,6 +519,19 @@ TEST(GpuHist, TestHistogramIndex) {

     ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
+  }
 }

+TEST(GpuHist, TestHistogramIndex) {
+  TestHistogramIndexImpl(1);
+}
+
+#if defined(XGBOOST_USE_NCCL)
+TEST(GpuHist, MGPU_TestHistogramIndex) {
+  auto devices = GPUSet::AllVisible();
+  CHECK_GT(devices.Size(), 1);
+  TestHistogramIndexImpl(-1);
+}
+#endif
+
 }  // namespace tree
 }  // namespace xgboost