Gradient based sampling for GPU Hist (#5093)

* Implement gradient based sampling for GPU Hist tree method. * Add samplers and handle compacted page in GPU Hist.
2020-02-03 18:31:27 -08:00
parent c74216f22c
commit e4b74c4d22
18 changed files with 1187 additions and 175 deletions
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -64,6 +64,20 @@ __global__ void CompressBinEllpackKernel(
  wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature);
 }

+// Construct an ELLPACK matrix with the given number of empty rows.
+EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) {
+  monitor_.Init("ellpack_page");
+  dh::safe_cuda(cudaSetDevice(device));
+
+  matrix.info = info;
+  matrix.base_rowid = 0;
+  matrix.n_rows = n_rows;
+
+  monitor_.StartCuda("InitCompressedData");
+  InitCompressedData(device, n_rows);
+  monitor_.StopCuda("InitCompressedData");
+}
+
 // Construct an ELLPACK matrix in memory.
 EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
  monitor_.Init("ellpack_page");
@@ -96,6 +110,85 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
  monitor_.StopCuda("BinningCompression");
 }

+// A functor that copies the data from one EllpackPage to another.
+struct CopyPage {
+  common::CompressedBufferWriter cbw;
+  common::CompressedByteT* dst_data_d;
+  common::CompressedIterator<uint32_t> src_iterator_d;
+  // The number of elements to skip.
+  size_t offset;
+
+  CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
+      : cbw{dst->matrix.info.NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+        offset(offset) {}
+
+  __device__ void operator()(size_t element_id) {
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
+  }
+};
+
+// Copy the data from the given EllpackPage to the current page.
+size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
+  monitor_.StartCuda("Copy");
+  size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
+  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
+  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
+  CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements);
+  dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
+  monitor_.StopCuda("Copy");
+  return num_elements;
+}
+
+// A functor that compacts the rows from one EllpackPage into another.
+struct CompactPage {
+  common::CompressedBufferWriter cbw;
+  common::CompressedByteT* dst_data_d;
+  common::CompressedIterator<uint32_t> src_iterator_d;
+  /*! \brief An array that maps the rows from the full DMatrix to the compacted page.
+   *
+   * The total size is the number of rows in the original, uncompacted DMatrix. Elements are the
+   * row ids in the compacted page. Rows not needed are set to SIZE_MAX.
+   *
+   * An example compacting 16 rows to 8 rows:
+   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX,
+   * SIZE_MAX]
+   */
+  common::Span<size_t> row_indexes;
+  size_t base_rowid;
+  size_t row_stride;
+
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span<size_t> row_indexes)
+      : cbw{dst->matrix.info.NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+        row_indexes(row_indexes),
+        base_rowid{src->matrix.base_rowid},
+        row_stride{src->matrix.info.row_stride} {}
+
+  __device__ void operator()(size_t row_id) {
+    size_t src_row = base_rowid + row_id;
+    size_t dst_row = row_indexes[src_row];
+    if (dst_row == SIZE_MAX) return;
+    size_t dst_offset = dst_row * row_stride;
+    size_t src_offset = row_id * row_stride;
+    for (size_t j = 0; j < row_stride; j++) {
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
+    }
+  }
+};
+
+// Compacts the data from the given EllpackPage into the current page.
+void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes) {
+  monitor_.StartCuda("Compact");
+  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
+  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
+  CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size());
+  dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes));
+  monitor_.StopCuda("Compact");
+}
+
 // Construct an EllpackInfo based on histogram cuts of features.
 EllpackInfo::EllpackInfo(int device,
                         bool is_dense,
@@ -123,16 +216,14 @@ void EllpackPageImpl::InitInfo(int device,

 // Initialize the buffer to stored compressed features.
 void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) {
-  size_t num_symbols = matrix.info.n_bins + 1;
+  size_t num_symbols = matrix.info.NumSymbols();

  // Required buffer size for storing data matrix in ELLPack format.
  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
      matrix.info.row_stride * num_rows, num_symbols);
  ba_.Allocate(device, &gidx_buffer, compressed_size_bytes);

-  thrust::fill(
-      thrust::device_pointer_cast(gidx_buffer.data()),
-      thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
+  thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);

  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
 }
@@ -149,7 +240,6 @@ void EllpackPageImpl::CreateHistIndices(int device,

  const auto& offset_vec = row_batch.offset.ConstHostVector();

-  int num_symbols = matrix.info.n_bins + 1;
  // bin and compress entries in batches of rows
  size_t gpu_batch_nrows = std::min(
      dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
@@ -193,7 +283,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
                     1);
    dh::LaunchKernel {grid3, block3} (
        CompressBinEllpackKernel,
-        common::CompressedBufferWriter(num_symbols),
+        common::CompressedBufferWriter(matrix.info.NumSymbols()),
        gidx_buffer.data(),
        row_ptrs.data().get(),
        entries_d.data().get(),
@@ -254,11 +344,9 @@ void EllpackPageImpl::CompressSparsePage(int device) {

 // Return the memory cost for storing the compressed features.
 size_t EllpackPageImpl::MemCostBytes() const {
-  size_t num_symbols = matrix.info.n_bins + 1;
-
  // Required buffer size for storing data matrix in ELLPack format.
  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * matrix.n_rows, num_symbols);
+      matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols());
  return compressed_size_bytes;
 }

@@ -280,5 +368,4 @@ void EllpackPageImpl::InitDevice(int device, EllpackInfo info) {

  device_initialized_ = true;
 }
-
 }  // namespace xgboost