Prepare gradient index for Quantile DMatrix. (#8103)

* Prepare gradient index for Quantile DMatrix. - Implement push batch with adapter batch. - Implement `GetFvalue` for prediction.
2022-07-22 17:26:33 +08:00
parent 1be09848a7
commit 4a4e5c7c18
7 changed files with 254 additions and 70 deletions
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -4,13 +4,17 @@
 */
 #ifndef XGBOOST_DATA_GRADIENT_INDEX_H_
 #define XGBOOST_DATA_GRADIENT_INDEX_H_
+
+#include <algorithm>  // std::min
 #include <memory>
 #include <vector>

 #include "../common/categorical.h"
 #include "../common/hist_util.h"
+#include "../common/numeric.h"
 #include "../common/threading_utils.h"
 #include "adapter.h"
+#include "proxy_dmatrix.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"

@@ -18,7 +22,6 @@ namespace xgboost {
 namespace common {
 class ColumnMatrix;
 }  // namespace common
-
 /*!
 * \brief preprocessed global index matrix, in CSR format
 *
@@ -26,24 +29,39 @@ class ColumnMatrix;
 *  index for CPU histogram.  On GPU ellpack page is used.
 */
 class GHistIndexMatrix {
+  // Get the size of each row: for every row of an adapter batch, count the elements
+  // accepted by `data::IsValidFunctor{missing}` (presumably the non-missing entries; the
+  // functor is defined outside this header).
+  //
+  // \param batch     Adapter batch; only `Size()` and `GetLine(i)` are used here.
+  // \param missing   Value handed to `data::IsValidFunctor`.
+  // \param n_threads Thread count forwarded to `common::ParallelFor`.
+  // \return A `std::vector<size_t>` with `batch.Size()` entries, one count per row.
+  template <typename AdapterBatchT>
+  auto GetRowCounts(AdapterBatchT const& batch, float missing, int32_t n_threads) {
+    std::vector<size_t> valid_counts(batch.Size(), 0);
+    // Iteration i writes only valid_counts[i], so rows are counted independently.
+    common::ParallelFor(batch.Size(), n_threads, [&](size_t i) {
+      auto line = batch.GetLine(i);
+      for (size_t j = 0; j < line.Size(); ++j) {
+        data::COOTuple elem = line.GetElement(j);
+        if (data::IsValidFunctor {missing}(elem)) {
+          valid_counts[i]++;
+        }
+      }
+    });
+    return valid_counts;
+  }
+
  /**
   * \brief Push a page into index matrix, the function is only necessary because hist has
   *        partial support for external memory.
   */
-  void PushBatch(SparsePage const& batch, common::Span<FeatureType const> ft,
-                 bst_bin_t n_total_bins, int32_t n_threads);
+  void PushBatch(SparsePage const& batch, common::Span<FeatureType const> ft, int32_t n_threads);

  template <typename Batch, typename BinIdxType, typename GetOffset, typename IsValid>
-  void SetIndexData(common::Span<BinIdxType> index_data_span, common::Span<FeatureType const> ft,
-                    size_t batch_threads, Batch const& batch, IsValid&& is_valid, size_t nbins,
-                    GetOffset&& get_offset) {
+  void SetIndexData(common::Span<BinIdxType> index_data_span, size_t rbegin,
+                    common::Span<FeatureType const> ft, size_t batch_threads, Batch const& batch,
+                    IsValid&& is_valid, size_t nbins, GetOffset&& get_offset) {
    auto batch_size = batch.Size();
    BinIdxType* index_data = index_data_span.data();
    auto const& ptrs = cut.Ptrs();
    auto const& values = cut.Values();
    common::ParallelFor(batch_size, batch_threads, [&](size_t i) {
      auto line = batch.GetLine(i);
-      size_t ibegin = row_ptr[i];  // index of first entry for current block
+      size_t ibegin = row_ptr[rbegin + i];  // index of first entry for current block
      size_t k = 0;
      auto tid = omp_get_thread_num();
      for (size_t j = 0; j < line.Size(); ++j) {
@@ -63,6 +81,49 @@ class GHistIndexMatrix {
    });
  }

+  /**
+   * \brief Write the bin indices of one batch into `index` and fold the per-thread hit
+   *        counters into `hit_count`.
+   *
+   * `row_ptr` must already hold the offsets for rows [rbegin, rbegin + batch.Size()];
+   * this function reads `row_ptr[rbegin + batch.Size()]` but never writes `row_ptr`.
+   *
+   * \param n_threads Thread count used for the hit-count reduction; also the upper bound
+   *                  for the threads used to fill the index.
+   * \param batch     Batch to push; `Size()` is used here, the rest is forwarded.
+   * \param rbegin    Position in `row_ptr` of the first row of this batch.
+   * \param is_valid  Predicate forwarded to `SetIndexData`.
+   * \param ft        Feature types forwarded to `SetIndexData`.
+   */
+  template <typename Batch, typename IsValid>
+  void PushBatchImpl(int32_t n_threads, Batch const& batch, size_t rbegin, IsValid&& is_valid,
+                     common::Span<FeatureType const> ft) {
+    // The number of threads is pegged to the batch size: at least 1, at most n_threads,
+    // and never more than the number of rows in the batch. If the OMP block is
+    // parallelized on anything other than the batch/block size, it should be reassigned.
+    size_t batch_threads =
+        std::max(static_cast<size_t>(1), std::min(batch.Size(), static_cast<size_t>(n_threads)));
+
+    auto n_bins_total = cut.TotalBins();
+    const size_t n_index = row_ptr[rbegin + batch.Size()];  // end offset of this batch's entries
+    ResizeIndex(n_index, isDense_);
+    if (isDense_) {
+      index.SetBinOffset(cut.Ptrs());
+    }
+    uint32_t const* offsets = index.Offset();
+    if (isDense_) {
+      // Inside the lambda functions, bin_idx is the index for cut value across all
+      // features. By subtracting the starting pointer of each feature from it, we can
+      // reduce it to a smaller value and compress it to a smaller type T, which is
+      // selected by DispatchBinType from index.GetBinTypeSize().
+      common::DispatchBinType(index.GetBinTypeSize(), [&](auto dtype) {
+        using T = decltype(dtype);
+        common::Span<T> index_data_span = {index.data<T>(), index.Size()};
+        SetIndexData(
+            index_data_span, rbegin, ft, batch_threads, batch, is_valid, n_bins_total,
+            [offsets](auto bin_idx, auto fidx) { return static_cast<T>(bin_idx - offsets[fidx]); });
+      });
+    } else {
+      /* For a sparse DMatrix we would have to store the feature index of each bin
+         in the index field to choose the right offset. So no offset is applied here:
+         the index is stored as uint32_t and is not reduced. */
+      common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
+      SetIndexData(index_data_span, rbegin, ft, batch_threads, batch, is_valid, n_bins_total,
+                   [](auto idx, auto) { return idx; });
+    }
+
+    // Reduce the per-thread counters (laid out as tid * n_bins_total + idx) into
+    // hit_count. The loop is parallel over bins, so each hit_count[idx] is touched by a
+    // single iteration. Presumably the per-thread slots are filled by SetIndexData.
+    common::ParallelFor(n_bins_total, n_threads, [&](bst_omp_uint idx) {
+      for (int32_t tid = 0; tid < n_threads; ++tid) {
+        hit_count[idx] += hit_count_tloc_[tid * n_bins_total + idx];
+        hit_count_tloc_[tid * n_bins_total + idx] = 0;  // reset for next batch
+      }
+    });
+  }
+
 public:
  /*! \brief row pointer to rows by element position */
  std::vector<size_t> row_ptr;
@@ -77,15 +138,53 @@ class GHistIndexMatrix {
  /*! \brief base row index for current page (used by external memory) */
  size_t base_rowid{0};

-  GHistIndexMatrix();
+  ~GHistIndexMatrix();
+  /**
+   * \brief Constructor for SimpleDMatrix.
+   */
  GHistIndexMatrix(DMatrix* x, bst_bin_t max_bins_per_feat, double sparse_thresh,
                   bool sorted_sketch, int32_t n_threads, common::Span<float> hess = {});
-  ~GHistIndexMatrix();
+  /**
+   * \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
+   *        for push batch.
+   */
+  GHistIndexMatrix(MetaInfo const& info, common::HistogramCuts&& cuts, bst_bin_t max_bin_per_feat);
+  /**
+   * \brief Constructor for external memory.
+   */
+  GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
+                   common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
+                   double sparse_thresh, int32_t n_threads);
+  GHistIndexMatrix();  // also for ext mem, empty ctor so that we can read the cache back.

-  // Create a global histogram matrix, given cut. Used by external memory
-  void Init(SparsePage const& page, common::Span<FeatureType const> ft,
-            common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
-            double sparse_thresh, int32_t n_threads);
+  /**
+   * \brief Push one adapter batch: fill `row_ptr` for its rows, write its bin indices,
+   *        and build the column matrix once the last row has been pushed.
+   *
+   * \param ctx             Supplies the thread count via `ctx->Threads()`.
+   * \param rbegin          Position in `row_ptr` of the first row of this batch.
+   * \param prev_sum        Initial value passed to `common::PartialSum`; presumably the
+   *                        number of entries pushed by earlier batches.
+   * \param batch           Adapter batch to push.
+   * \param missing         Value used to build `data::IsValidFunctor`.
+   * \param ft              Feature types forwarded to `PushBatchImpl`.
+   * \param sparse_thresh   Forwarded to the `common::ColumnMatrix` constructor on the
+   *                        final batch; must not be NaN at that point.
+   * \param n_samples_total Total number of rows; the batch ending at this row count is
+   *                        treated as the final one.
+   */
+  template <typename Batch>
+  void PushAdapterBatch(Context const* ctx, size_t rbegin, size_t prev_sum, Batch const& batch,
+                        float missing, common::Span<FeatureType const> ft, double sparse_thresh,
+                        size_t n_samples_total) {
+    auto n_bins_total = cut.TotalBins();
+    // Fresh zeroed per-thread hit counters: one slot per (thread, bin) pair.
+    hit_count_tloc_.clear();
+    hit_count_tloc_.resize(ctx->Threads() * n_bins_total, 0);
+
+    auto n_threads = ctx->Threads();
+    auto valid_counts = GetRowCounts(batch, missing, n_threads);
+
+    // Turn the per-row counts into offsets written to row_ptr starting at rbegin.
+    // row_ptr is not resized here and PushBatchImpl reads row_ptr[rbegin + batch.Size()],
+    // so it must already be large enough -- presumably sized by the caller/constructor.
+    auto it = common::MakeIndexTransformIter([&](size_t ridx) { return valid_counts[ridx]; });
+    common::PartialSum(n_threads, it, it + batch.Size(), prev_sum, row_ptr.begin() + rbegin);
+    auto is_valid = data::IsValidFunctor{missing};
+
+    PushBatchImpl(ctx->Threads(), batch, rbegin, is_valid, ft);
+
+    if (rbegin + batch.Size() == n_samples_total) {
+      // Finished: this batch ends at the last row, so the column matrix can be built.
+      // NOTE(review): std::isnan is declared in <cmath>, which is not among the includes
+      // visible in this diff -- presumably pulled in transitively; confirm.
+      CHECK(!std::isnan(sparse_thresh));
+      this->columns_ = std::make_unique<common::ColumnMatrix>(*this, sparse_thresh);
+    }
+  }
+
+  // Call ColumnMatrix::PushBatch
+  template <typename Batch>
+  void PushAdapterBatchColumns(Context const* ctx, Batch const& batch, float missing,
+                               size_t rbegin);

  void ResizeIndex(const size_t n_index, const bool isDense);

@@ -117,6 +216,8 @@ class GHistIndexMatrix {

  common::ColumnMatrix const& Transpose() const;

+  float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
+
 private:
  std::unique_ptr<common::ColumnMatrix> columns_;
  std::vector<size_t> hit_count_tloc_;