[EM] Multi-level quantile sketching for GPU. (#10813)
parent 3ef8383d93
commit ed5f33df16
@@ -105,17 +105,21 @@ def main():
     assert m_with_it.num_row() == m.num_row()
     # Tree method must be `hist`.
     reg_with_it = xgboost.train(
-        {"tree_method": "hist", "device": "cuda"}, m_with_it, num_boost_round=rounds
+        {"tree_method": "hist", "device": "cuda"},
+        m_with_it,
+        num_boost_round=rounds,
+        evals=[(m_with_it, "Train")],
     )
     predict_with_it = reg_with_it.predict(m_with_it)

     reg = xgboost.train(
-        {"tree_method": "hist", "device": "cuda"}, m, num_boost_round=rounds
+        {"tree_method": "hist", "device": "cuda"},
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
     )
     predict = reg.predict(m)

     numpy.testing.assert_allclose(predict_with_it, predict, rtol=1e-6)


 if __name__ == "__main__":
     main()
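For context, `m_with_it` above is built from a data iterator, which is exactly the path that feeds batches to the GPU sketcher this commit changes. A minimal sketch of that pattern using the public `xgboost.DataIter` interface; the `Batches` class, shapes, and seed here are illustrative and not part of this commit:

import numpy
import xgboost

class Batches(xgboost.DataIter):
    """Yield pre-made (X, y) batches to XGBoost one at a time."""

    def __init__(self, batches):
        self._batches = batches
        self._it = 0
        super().__init__()

    def next(self, input_data):
        # Called by XGBoost once per batch; return 0 when exhausted.
        if self._it == len(self._batches):
            return 0
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self):
        self._it = 0

rng = numpy.random.default_rng(2024)
batches = [(rng.normal(size=(256, 8)), rng.normal(size=256)) for _ in range(4)]
# Each fetched batch becomes one unit of sketching work on the GPU path.
m_with_it = xgboost.QuantileDMatrix(Batches(batches))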
@@ -143,17 +143,30 @@ class SketchContainer {
    */
   void Push(Context const* ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
             common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights = {});
-  /* \brief Prune the quantile structure.
-   * \param to The maximum size of pruned quantile. If the size of quantile
-   * structure is already less than `to`, then no operation is performed.
+  /**
+   * @brief Prune the quantile structure.
+   *
+   * @param to The maximum size of the pruned quantile. If the size of the quantile
+   *           structure is already less than `to`, then no operation is performed.
    */
   void Prune(Context const* ctx, size_t to);
-  /* \brief Merge another set of sketch.
-   * \param that columns of other.
+  /**
+   * @brief Merge another set of sketches.
+   *
+   * @param that_columns_ptr Column pointer of the quantile summary being merged.
+   * @param that Columns of the other quantile summary.
    */
   void Merge(Context const* ctx, Span<OffsetT const> that_columns_ptr,
              Span<SketchEntry const> that);
+  /**
+   * @brief Shrink the internal data structure to reduce memory usage. Can be used after
+   * prune.
+   */
+  void ShrinkToFit() {
+    this->Current().shrink_to_fit();
+    this->Other().clear();
+    this->Other().shrink_to_fit();
+  }
+
   /* \brief Merge quantiles from other GPU workers. */
   void AllReduce(Context const* ctx, bool is_column_split);
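A conceptual note on `Prune` and `Merge`: per feature, the container holds a sorted summary of entries; merging interleaves two summaries, and pruning caps a summary's size while keeping entries spread across ranks. A single-feature toy in Python under those assumptions; the real `SketchContainer` tracks per-entry rank-error bounds and lives in device memory, so this is only a mental model, not the implementation:

import heapq

def merge(a, b):
    # Merge two sorted summaries into one sorted summary.
    return list(heapq.merge(a, b))

def prune(summary, to):
    # Keep at most `to` evenly spaced entries; no-op if the summary is
    # already small enough, matching the `to` parameter contract above.
    if len(summary) <= to:
        return summary
    step = (len(summary) - 1) / (to - 1)
    return [summary[round(i * step)] for i in range(to)]

print(prune(merge([0.1, 0.5, 0.9], [0.2, 0.4, 0.6, 0.8]), 5))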
@@ -3,6 +3,7 @@
  */
 #include <algorithm>  // for max
 #include <numeric>    // for partial_sum
+#include <utility>    // for pair
 #include <vector>     // for vector

 #include "../collective/allreduce.h"  // for Allreduce

@@ -29,11 +30,39 @@ void MakeSketches(Context const* ctx,
                   float missing, std::shared_ptr<common::HistogramCuts> cuts, MetaInfo const& info,
                   ExternalDataInfo* p_ext_info) {
   xgboost_NVTX_FN_RANGE();

-  std::unique_ptr<common::SketchContainer> sketch;
+  /**
+   * A variant of: A Fast Algorithm for Approximate Quantiles in High Speed Data Streams
+   *
+   * The original algorithm was designed for the CPU, where the input is a stream of
+   * individual elements. For the GPU, we process the data in batches. As a result, the
+   * implementation here simply uses the user input batch as the basic unit of sketching
+   * blocks. The number of blocks per level grows exponentially.
+   */
+  std::vector<std::pair<std::unique_ptr<common::SketchContainer>, bst_idx_t>> sketches;
   auto& ext_info = *p_ext_info;

+  auto lazy_init_sketch = [&] {
+    // Lazy because we need the `n_features`.
+    sketches.emplace_back(std::make_unique<common::SketchContainer>(
+                              proxy->Info().feature_types, p.max_bin, ext_info.n_features,
+                              data::BatchSamples(proxy), dh::GetDevice(ctx)),
+                          0);
+  };
+
+  // Workaround empty input with CPU ctx.
+  Context new_ctx;
+  Context const* p_ctx;
+  if (ctx->IsCUDA()) {
+    p_ctx = ctx;
+  } else {
+    new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
+    p_ctx = &new_ctx;
+  }
+
   do {
+    /**
+     * Get the data shape.
+     */
     // We use do-while here as the first batch is fetched in the ctor.
     CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
     common::SetDevice(dh::GetDevice(ctx).ordinal);

@@ -46,28 +75,38 @@ void MakeSketches(Context const* ctx,
       CHECK_EQ(ext_info.n_features, ::xgboost::data::BatchColumns(proxy))
           << "Inconsistent number of columns.";
     }

-    auto batch_rows = data::BatchSamples(proxy);
-    ext_info.accumulated_rows += batch_rows;
-
     /**
      * Handle sketching.
      */
     if (!ref) {
-      if (!sketch) {
-        sketch = std::make_unique<common::SketchContainer>(
-            proxy->Info().feature_types, p.max_bin, ext_info.n_features, data::BatchSamples(proxy),
-            dh::GetDevice(ctx));
+      if (sketches.empty()) {
+        lazy_init_sketch();
       }
+      if (sketches.back().second > (1ul << (sketches.size() - 1))) {
+        auto n_cuts_per_feat =
+            common::detail::RequiredSampleCutsPerColumn(p.max_bin, ext_info.accumulated_rows);
+        // Prune to a single block.
+        sketches.back().first->Prune(p_ctx, n_cuts_per_feat);
+        sketches.back().first->ShrinkToFit();
+
+        sketches.back().second = 1;
+        lazy_init_sketch();  // Add a new level.
+      }
       proxy->Info().weights_.SetDevice(dh::GetDevice(ctx));
       cuda_impl::Dispatch(proxy, [&](auto const& value) {
-        // Workaround empty input with CPU ctx.
-        Context new_ctx;
-        Context const* p_ctx;
-        if (ctx->IsCUDA()) {
-          p_ctx = ctx;
-        } else {
-          new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
-          p_ctx = &new_ctx;
-        }
-        common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get());
+        common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing,
+                                    sketches.back().first.get());
+        sketches.back().second++;
       });
     }
+    auto batch_rows = data::BatchSamples(proxy);
+    ext_info.accumulated_rows += batch_rows;

     /**
      * Rest of the data shape.
      */
     dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
     common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
     ext_info.row_stride =

@@ -87,7 +126,28 @@ void MakeSketches(Context const* ctx,
   // Get reference
   common::SetDevice(dh::GetDevice(ctx).ordinal);
   if (!ref) {
-    sketch->MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
+    HostDeviceVector<FeatureType> ft;
+    common::SketchContainer final_sketch(
+        sketches.empty() ? ft : sketches.front().first->FeatureTypes(), p.max_bin,
+        ext_info.n_features, ext_info.accumulated_rows, dh::GetDevice(ctx));
+    // Reverse order since the last container might contain a summary that's not yet pruned.
+    for (auto it = sketches.crbegin(); it != sketches.crend(); ++it) {
+      auto& sketch = *it;
+
+      CHECK_GE(sketch.second, 1);
+      if (sketch.second > 1) {
+        sketch.first->Prune(p_ctx, common::detail::RequiredSampleCutsPerColumn(
+                                       p.max_bin, ext_info.accumulated_rows));
+        sketch.first->ShrinkToFit();
+      }
+      final_sketch.Merge(p_ctx, sketch.first->ColumnsPtr(), sketch.first->Data());
+      final_sketch.FixError();
+    }
+
+    sketches.clear();
+    sketches.shrink_to_fit();
+
+    final_sketch.MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
   } else {
     GetCutsFromRef(ctx, ref, ext_info.n_features, p, cuts.get());
   }
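The control flow above amounts to a simple schedule: each level accumulates sketch blocks until it exceeds its capacity (which doubles with each new level), at which point it is pruned down to a single block and a new level is opened; at the end, `final_sketch` merges the levels in reverse. A toy trace of that schedule, assuming the check-then-push order shown in the diff (counters only, no actual sketching):

def push_batch(levels):
    # `levels[i]` mirrors `sketches[i].second`, the block count of level i.
    if not levels:
        levels.append(0)  # lazy_init_sketch: open the first level
    if levels[-1] > (1 << (len(levels) - 1)):
        levels[-1] = 1    # prune the full level down to a single block
        levels.append(0)  # lazy_init_sketch: add a new level
    levels[-1] += 1       # sketch this batch into the last level

levels = []
for _ in range(64):
    push_batch(levels)
print(levels)  # block count per level after 64 batches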
@@ -289,11 +289,11 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPolicy {
     auto page = std::make_shared<S>();
     this->exce_.Run([&] {
       std::unique_ptr<typename FormatStreamPolicy::FormatT> fmt{
-          this->CreatePageFormat(this->param_)};
+          self->CreatePageFormat(self->param_)};
       auto name = self->cache_info_->ShardName();
       auto [offset, length] = self->cache_info_->View(fetch_it);
       std::unique_ptr<typename FormatStreamPolicy::ReaderT> fi{
-          this->CreateReader(name, offset, length)};
+          self->CreateReader(name, offset, length)};
       CHECK(fmt->Read(page.get(), fi.get()));
     });
     return page;
@@ -17,6 +17,7 @@

 #include "../../../include/xgboost/logging.h"
 #include "../../../src/common/cuda_context.cuh"
+#include "../../../src/common/cuda_rt_utils.h"  // for SetDevice
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/hist_util.h"

@@ -59,8 +60,7 @@ TEST(HistUtil, SketchBatchNumElements) {
   GTEST_SKIP_("Test not runnable with RMM enabled.");
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
   size_t constexpr kCols = 10000;
-  int device;
-  dh::safe_cuda(cudaGetDevice(&device));
+  std::int32_t device = dh::CurrentDevice();
   auto avail = static_cast<size_t>(dh::AvailableMemory(device) * 0.8);
   auto per_elem = detail::BytesPerElement(false);
   auto avail_elem = avail / per_elem;

@@ -576,7 +576,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {

 namespace {
 auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
-  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
+  common::SetDevice(ctx->Ordinal());
   auto n = n_samples * n_features;
   std::vector<float> x;
   x.resize(n);
@@ -13,7 +13,7 @@

 namespace xgboost::data {
 void TestEquivalent(float sparsity) {
-  Context ctx{MakeCUDACtx(0)};
+  auto ctx = MakeCUDACtx(0);

   CudaArrayIterForTest iter{sparsity};
   IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,