Support optimal partitioning for GPU hist. (#7652)

* Implement `MaxCategory` in quantile.
* Implement partition-based split for GPU evaluation.  Currently, it's based on the existing evaluation function.
* Extract an evaluator from GPU Hist to store the needed states.
* Added some CUDA stream/event utilities.
* Update document with references.
* Fixed a bug in the approx evaluator that occurred when the number of data points was less than the number of categories.
This commit is contained in:
Jiaming Yuan
2022-02-15 03:03:12 +08:00
committed by GitHub
parent 2369d55e9a
commit 0d0abe1845
26 changed files with 1088 additions and 528 deletions

View File

@@ -16,6 +16,10 @@
namespace xgboost {
namespace common {
using CatBitField = LBitField32;
using KCatBitField = CLBitField32;
// Cast the categorical type.
template <typename T>
XGBOOST_DEVICE bst_cat_t AsCat(T const& v) {
@@ -57,6 +61,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
return dft_left;
}
auto pos = KCatBitField::ToBitPos(cat);
if (pos.int_pos >= cats.size()) {
return true;
}
return !s_cats.Check(AsCat(cat));
}
@@ -73,18 +82,14 @@ inline void InvalidCategory() {
/*!
* \brief Whether should we use onehot encoding for categorical data.
*/
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot ||
(task.task != ObjInfo::kRegression && task.task != ObjInfo::kBinary);
// Whether one-hot encoding should be used to split a categorical feature.
// One-hot is chosen when the number of categories is below the user-supplied
// threshold, or when the objective itself requires it (task.UseOneHot()).
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
return use_one_hot;
}
// Predicate functor: true when the feature type is categorical.  Marked
// XGBOOST_DEVICE so it can be invoked from device code as well as the host.
struct IsCatOp {
XGBOOST_DEVICE bool operator()(FeatureType ft) { return ft == FeatureType::kCategorical; }
};
using CatBitField = LBitField32;
using KCatBitField = CLBitField32;
} // namespace common
} // namespace xgboost

View File

@@ -952,22 +952,22 @@ thrust::device_ptr<T const> tcend(xgboost::HostDeviceVector<T> const& vector) {
}
template <typename T>
thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
return thrust::device_ptr<T>(span.data());
}
template <typename T>
thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
return thrust::device_ptr<T>(span.data());
}
template <typename T>
thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
return tbegin(span) + span.size();
}
template <typename T>
thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
return tbegin(span) + span.size();
}
@@ -982,12 +982,12 @@ XGBOOST_DEVICE auto trend(xgboost::common::Span<T> &span) { // NOLINT
}
template <typename T>
thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
return thrust::device_ptr<T const>(span.data());
}
template <typename T>
thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
return tcbegin(span) + span.size();
}
@@ -1536,4 +1536,69 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
}
// Forward declaration: CUDAEvent::Record takes a CUDAStreamView by value, so
// its definition is deferred until after CUDAStreamView is complete.
class CUDAStreamView;
// RAII owner of a cudaEvent_t.  The event is created with timing disabled
// (cudaEventDisableTiming): it is used only for stream ordering, not for
// measuring elapsed time.
class CUDAEvent {
cudaEvent_t event_{nullptr};
public:
CUDAEvent() { dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
~CUDAEvent() {
// Skip destruction when the handle is null.
if (event_) {
dh::safe_cuda(cudaEventDestroy(event_));
}
}
// Non-copyable: a copy would trigger a double cudaEventDestroy.
CUDAEvent(CUDAEvent const &that) = delete;
CUDAEvent &operator=(CUDAEvent const &that) = delete;
// Record this event at the current point in `stream` (defined out-of-line
// below, once CUDAStreamView is a complete type).
inline void Record(CUDAStreamView stream); // NOLINT
operator cudaEvent_t() const { return event_; } // NOLINT
};
// Non-owning view of a cudaStream_t.  Cheap to copy; never creates or
// destroys the underlying stream.
class CUDAStreamView {
cudaStream_t stream_{nullptr};
public:
explicit CUDAStreamView(cudaStream_t s) : stream_{s} {}
// Make all future work submitted to this stream wait until `e` completes.
void Wait(CUDAEvent const &e) {
#if defined(__CUDACC_VER_MAJOR__)
#if __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0
// CUDA == 11.0: pass the literal flag 0 (cudaEventWaitDefault is
// presumably not usable with this toolkit version).
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, 0));
#else
// CUDA > 11.0
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
#endif // __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0
#else // clang
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
#endif // defined(__CUDACC_VER_MAJOR__)
}
operator cudaStream_t() const { // NOLINT
return stream_;
}
// Block the calling host thread until all work in this stream has finished.
void Sync() { dh::safe_cuda(cudaStreamSynchronize(stream_)); }
};
// Record this event at the current point in `stream`.  Out-of-line because it
// requires the complete CUDAStreamView type.
inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT
dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
}
// Returns a view of the legacy default stream (cudaStreamLegacy).
inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; }
// RAII owner of a cudaStream_t created with cudaStreamNonBlocking, i.e. work
// in this stream does not implicitly synchronize with the legacy default
// stream.  The stream is destroyed in the destructor.
class CUDAStream {
cudaStream_t stream_;
public:
CUDAStream() {
dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}
~CUDAStream() {
dh::safe_cuda(cudaStreamDestroy(stream_));
}
// Non-copyable: copying would cause cudaStreamDestroy to be invoked twice on
// the same handle (consistent with CUDAEvent, which also deletes its copy
// operations).
CUDAStream(CUDAStream const &that) = delete;
CUDAStream &operator=(CUDAStream const &that) = delete;
// Obtain a non-owning view of the underlying stream.
CUDAStreamView View() const { return CUDAStreamView{stream_}; }
};
} // namespace dh

View File

@@ -33,66 +33,84 @@ namespace common {
*/
using GHistIndexRow = Span<uint32_t const>;
// A CSC matrix representing histogram cuts, used in CPU quantile hist.
// A CSC matrix representing histogram cuts.
// The cut values represent upper bounds of bins containing approximately equal numbers of elements
class HistogramCuts {
bool has_categorical_{false};
float max_cat_{-1.0f};
protected:
using BinIdx = uint32_t;
void Swap(HistogramCuts&& that) noexcept(true) {
std::swap(cut_values_, that.cut_values_);
std::swap(cut_ptrs_, that.cut_ptrs_);
std::swap(min_vals_, that.min_vals_);
std::swap(has_categorical_, that.has_categorical_);
std::swap(max_cat_, that.max_cat_);
}
void Copy(HistogramCuts const& that) {
cut_values_.Resize(that.cut_values_.Size());
cut_ptrs_.Resize(that.cut_ptrs_.Size());
min_vals_.Resize(that.min_vals_.Size());
cut_values_.Copy(that.cut_values_);
cut_ptrs_.Copy(that.cut_ptrs_);
min_vals_.Copy(that.min_vals_);
has_categorical_ = that.has_categorical_;
max_cat_ = that.max_cat_;
}
public:
HostDeviceVector<bst_float> cut_values_; // NOLINT
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
HostDeviceVector<float> cut_values_; // NOLINT
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
// storing minimum value in a sketch set.
HostDeviceVector<float> min_vals_; // NOLINT
HistogramCuts();
HistogramCuts(HistogramCuts const& that) {
cut_values_.Resize(that.cut_values_.Size());
cut_ptrs_.Resize(that.cut_ptrs_.Size());
min_vals_.Resize(that.min_vals_.Size());
cut_values_.Copy(that.cut_values_);
cut_ptrs_.Copy(that.cut_ptrs_);
min_vals_.Copy(that.min_vals_);
}
HistogramCuts(HistogramCuts const& that) { this->Copy(that); }
HistogramCuts(HistogramCuts&& that) noexcept(true) {
*this = std::forward<HistogramCuts&&>(that);
this->Swap(std::forward<HistogramCuts>(that));
}
HistogramCuts& operator=(HistogramCuts const& that) {
cut_values_.Resize(that.cut_values_.Size());
cut_ptrs_.Resize(that.cut_ptrs_.Size());
min_vals_.Resize(that.min_vals_.Size());
cut_values_.Copy(that.cut_values_);
cut_ptrs_.Copy(that.cut_ptrs_);
min_vals_.Copy(that.min_vals_);
this->Copy(that);
return *this;
}
HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
cut_ptrs_ = std::move(that.cut_ptrs_);
cut_values_ = std::move(that.cut_values_);
min_vals_ = std::move(that.min_vals_);
this->Swap(std::forward<HistogramCuts>(that));
return *this;
}
uint32_t FeatureBins(uint32_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) -
cut_ptrs_.ConstHostVector()[feature];
uint32_t FeatureBins(bst_feature_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
}
// Getters. Cuts should be of no use after building histogram indices, but currently
// they are deeply linked with quantile_hist, gpu sketcher and gpu_hist, so we preserve
// these for now.
std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
bool HasCategorical() const { return has_categorical_; }
float MaxCategory() const { return max_cat_; }
/**
* \brief Set meta info about categorical features.
*
* \param has_cat Do we have categorical feature in the data?
* \param max_cat The maximum categorical value in all features.
*/
void SetCategorical(bool has_cat, float max_cat) {
has_categorical_ = has_cat;
max_cat_ = max_cat;
}
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
// Return the index of a cut point that is strictly greater than the input
// value, or the last available index if none exists
BinIdx SearchBin(float value, uint32_t column_id, std::vector<uint32_t> const& ptrs,
BinIdx SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
@@ -102,7 +120,7 @@ class HistogramCuts {
return idx;
}
BinIdx SearchBin(float value, uint32_t column_id) const {
BinIdx SearchBin(float value, bst_feature_t column_id) const {
return this->SearchBin(value, column_id, Ptrs(), Values());
}

View File

@@ -272,7 +272,7 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
// move all categories into a flatten vector to prepare for allreduce
size_t total = feature_ptr.back();
std::vector<bst_cat_t> flatten(total, 0);
std::vector<float> flatten(total, 0);
auto cursor{flatten.begin()};
for (auto const &feat : categories) {
cursor = std::copy(feat.cbegin(), feat.cend(), cursor);
@@ -287,15 +287,15 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
auto gtotal = global_worker_ptr.back();
// categories in all workers with all features.
std::vector<bst_cat_t> global_categories(gtotal, 0);
std::vector<float> global_categories(gtotal, 0);
auto rank_begin = global_worker_ptr[rank];
auto rank_size = global_worker_ptr[rank + 1] - rank_begin;
CHECK_EQ(rank_size, total);
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
// gather values from all workers.
rabit::Allreduce<rabit::op::Sum>(global_categories.data(), global_categories.size());
QuantileAllreduce<bst_cat_t> allreduce_result{global_categories, global_worker_ptr,
global_feat_ptrs, categories.size()};
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
categories.size()};
ParallelFor(categories.size(), n_threads, [&](auto fidx) {
if (!IsCat(feature_types, fidx)) {
return;
@@ -531,6 +531,22 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
InvalidCategory();
}
}
auto const &ptrs = cuts->Ptrs();
auto const &vals = cuts->Values();
float max_cat{-std::numeric_limits<float>::infinity()};
for (size_t i = 1; i < ptrs.size(); ++i) {
if (IsCat(feature_types_, i - 1)) {
auto beg = ptrs[i - 1];
auto end = ptrs[i];
auto feat = Span<float const>{vals}.subspan(beg, end - beg);
auto max_elem = *std::max_element(feat.cbegin(), feat.cend());
if (max_elem > max_cat) {
max_cat = max_elem;
}
}
}
cuts->SetCategorical(true, max_cat);
}
monitor_.Stop(__func__);

View File

@@ -1,22 +1,23 @@
/*!
* Copyright 2020 by XGBoost Contributors
*/
#include <thrust/unique.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/binary_search.h>
#include <thrust/transform_scan.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/transform_scan.h>
#include <thrust/unique.h>
#include <limits> // std::numeric_limits
#include <memory>
#include <utility>
#include "xgboost/span.h"
#include "quantile.h"
#include "quantile.cuh"
#include "hist_util.h"
#include "device_helpers.cuh"
#include "categorical.h"
#include "common.h"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "quantile.h"
#include "xgboost/span.h"
namespace xgboost {
namespace common {
@@ -586,7 +587,7 @@ struct InvalidCatOp {
Span<uint32_t const> ptrs;
Span<FeatureType const> ft;
XGBOOST_DEVICE bool operator()(size_t i) {
XGBOOST_DEVICE bool operator()(size_t i) const {
auto fidx = dh::SegmentId(ptrs, i);
return IsCat(ft, fidx) && InvalidCat(values[i]);
}
@@ -683,18 +684,36 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
out_column[idx] = in_column[idx+1].value;
});
float max_cat{-1.0f};
if (has_categorical_) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = p_cuts->cut_ptrs_.ConstDeviceSpan();
auto it = thrust::make_counting_iterator(0ul);
auto invalid_op = InvalidCatOp{out_cut_values, d_out_columns_ptr, d_ft};
auto it = dh::MakeTransformIterator<thrust::pair<bool, float>>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(d_out_columns_ptr, i);
if (IsCat(d_ft, fidx)) {
auto invalid = invalid_op(i);
auto v = out_cut_values[i];
return thrust::make_pair(invalid, v);
}
return thrust::make_pair(false, std::numeric_limits<float>::min());
});
CHECK_EQ(p_cuts->Ptrs().back(), out_cut_values.size());
auto invalid = thrust::any_of(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
InvalidCatOp{out_cut_values, ptrs, d_ft});
bool invalid{false};
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::tie(invalid, max_cat) =
thrust::reduce(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
thrust::make_pair(false, std::numeric_limits<float>::min()),
[=] XGBOOST_DEVICE(thrust::pair<bool, bst_cat_t> const &l,
thrust::pair<bool, bst_cat_t> const &r) {
return thrust::make_pair(l.first || r.first, std::max(l.second, r.second));
});
if (invalid) {
InvalidCategory();
}
}
p_cuts->SetCategorical(this->has_categorical_, max_cat);
timer_.Stop(__func__);
}
} // namespace common