Move device histogram storage into histogram.cuh. (#10608)

Jiaming Yuan 2024-07-21 14:10:13 +08:00 committed by GitHub
parent cb62f9e73b
commit 6d9fcb771e
6 changed files with 171 additions and 167 deletions

src/tree/gpu_hist/histogram.cuh

@@ -6,6 +6,8 @@
 #include <memory>                         // for unique_ptr
 #include "../../common/cuda_context.cuh"  // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for LaunchN
+#include "../../common/device_vector.cuh"   // for device_vector
 #include "../../data/ellpack_page.cuh"    // for EllpackDeviceAccessor
 #include "feature_groups.cuh"             // for FeatureGroupsAccessor
 #include "xgboost/base.h"                 // for GradientPair, GradientPairInt64
@@ -60,6 +62,111 @@ class GradientQuantiser {
   }
 };
+
+/**
+ * @brief Data storage for node histograms on device. Automatically expands.
+ *
+ * @tparam kStopGrowingSize Do not grow beyond this size
+ *
+ * @author Rory
+ * @date 28/07/2018
+ */
+template <size_t kStopGrowingSize = 1 << 28>
+class DeviceHistogramStorage {
+ private:
+  using GradientSumT = GradientPairInt64;
+  /** @brief Map nidx to starting index of its histogram. */
+  std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms
+  dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage, allocate one histogram at a time in overflow.
+  // Not cached, overwritten when a new histogram is requested.
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
+  int n_bins_;
+  DeviceOrd device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
+
+ public:
+  // Start with about 16mb
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
+  void Init(DeviceOrd device_id, int n_bins) {
+    this->n_bins_ = n_bins;
+    this->device_id_ = device_id;
+  }
+
+  void Reset(Context const* ctx) {
+    auto d_data = data_.data().get();
+    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    nidx_map_.clear();
+    overflow_nidx_map_.clear();
+  }
+  [[nodiscard]] bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
+  }
+  [[nodiscard]] int Bins() const { return n_bins_; }
+  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
+
+  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
+    // Number of items currently used in data
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow
+      // Delete previous entries
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
+      }
+    } else {
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
+      }
+    }
+    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
+  }
+
+  /**
+   * @brief Return pointer to histogram memory for a given node.
+   * @param nidx Tree node index.
+   * @return hist pointer.
+   */
+  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
+    CHECK(this->HistogramExists(nidx));
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from normal cache
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    } else {
+      // Fetch from overflow
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    }
+  }
+};
 class DeviceHistogramBuilderImpl;
 class DeviceHistogramBuilder {
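
For orientation, here is a minimal usage sketch of the relocated storage class. This is illustrative only, not code from the commit; it assumes a valid `Context* ctx` bound to a CUDA device, and the bin count and node indices are made up:

    DeviceHistogramStorage<> hist;             // default kStopGrowingSize = 1 << 28
    hist.Init(ctx->Device(), /*n_bins=*/256);
    hist.AllocateHistograms(ctx, {0, 1, 2});   // contiguous, zero-initialised
    auto node0 = hist.GetNodeHistogram(0);     // common::Span<GradientPairInt64>
    hist.Reset(ctx);                           // re-zero the cache, clear the maps

The one functional change in the move is that `Reset` and `AllocateHistograms` now take a `Context`, so their zeroing kernels launch on the context's CUDA stream; this is why every call site in the files below gains a `ctx_` argument.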

src/tree/updater_gpu_hist.cu

@@ -49,113 +49,6 @@ namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 #endif  // !defined(GTEST_TEST)
-
-/**
- * \struct  DeviceHistogramStorage
- *
- * \summary Data storage for node histograms on device. Automatically expands.
- *
- * \tparam GradientSumT      histogram entry type.
- * \tparam kStopGrowingSize  Do not grow beyond this size
- *
- * \author  Rory
- * \date    28/07/2018
- */
-template <size_t kStopGrowingSize = 1 << 28>
-class DeviceHistogramStorage {
- private:
-  using GradientSumT = GradientPairInt64;
-  /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map_;
-  // Large buffer of zeroed memory, caches histograms
-  dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
-  dh::device_vector<typename GradientSumT::ValueT> overflow_;
-  std::map<int, size_t> overflow_nidx_map_;
-  int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
-      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
-
- public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
-
-  void Reset() {
-    auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
-    nidx_map_.clear();
-    overflow_nidx_map_.clear();
-  }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend() ||
-           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
-  }
-  [[nodiscard]] int Bins() const { return n_bins_; }
-  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
-  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
-
-  void AllocateHistograms(const std::vector<int>& new_nidxs) {
-    for (int nidx : new_nidxs) {
-      CHECK(!HistogramExists(nidx));
-    }
-    // Number of items currently used in data
-    const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
-      // Use overflow
-      // Delete previous entries
-      overflow_nidx_map_.clear();
-      overflow_.resize(HistogramSize() * new_nidxs.size());
-      // Zero memory
-      auto d_data = overflow_.data().get();
-      dh::LaunchN(overflow_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0; });
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
-      }
-    } else {
-      CHECK_GE(data_.size(), used_size);
-      // Expand if necessary
-      if (data_.size() < new_used_size) {
-        data_.resize(std::max(data_.size() * 2, new_used_size));
-      }
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
-      }
-    }
-    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
-  }
-
-  /**
-   * \summary Return pointer to histogram memory for a given node.
-   * \param nidx Tree node index.
-   * \return hist pointer.
-   */
-  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
-    CHECK(this->HistogramExists(nidx));
-    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
-      // Fetch from normal cache
-      auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    } else {
-      // Fetch from overflow
-      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    }
-  }
-};
 // Manage memory for a single GPU
 struct GPUHistMakerDevice {
  private:
@@ -258,7 +151,7 @@ struct GPUHistMakerDevice {
     // Init histogram
     hist.Init(ctx_->Device(), page->Cuts().TotalBins());
-    hist.Reset();
+    hist.Reset(ctx_);
 
     this->InitFeatureGroupsOnce();
@@ -657,7 +550,7 @@ struct GPUHistMakerDevice {
     all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
     // Allocate the histograms
    // Guaranteed contiguous memory
-    hist.AllocateHistograms(all_new);
+    hist.AllocateHistograms(ctx_, all_new);
 
     for (auto nidx : hist_nidx) {
       this->BuildHist(nidx);
@@ -748,7 +641,7 @@ struct GPUHistMakerDevice {
         ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
     collective::SafeColl(rc);
 
-    hist.AllocateHistograms({kRootNIdx});
+    hist.AllocateHistograms(ctx_, {kRootNIdx});
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, 1);
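
The updater-side changes above are mechanical: each call site passes `ctx_` through so that `dh::LaunchN` receives an explicit stream instead of launching on the legacy default stream. A standalone sketch of that stream-ordered zeroing pattern, for illustration only (`ZeroKernel` and `ZeroOnStream` are made-up names, not XGBoost code):

    #include <cuda_runtime.h>
    #include <cstddef>

    __global__ void ZeroKernel(float* data, std::size_t n) {
      std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] = 0.0f;
    }

    // Zero a device buffer on the caller's stream, so the reset is ordered
    // with the rest of the work already submitted to that stream.
    void ZeroOnStream(float* data, std::size_t n, cudaStream_t stream) {
      unsigned block = 256;
      unsigned grid = static_cast<unsigned>((n + block - 1) / block);
      ZeroKernel<<<grid, block, 0, stream>>>(data, n);
    }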

tests/cpp/helpers.cc

@@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {}
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
+
+std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 }  // namespace xgboost

tests/cpp/helpers.h

@@ -526,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
 inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
+
+// GPU device ordinal for distributed tests
+std::int32_t DistGpuIdx();
 inline auto GMockThrow(StringView msg) {
   return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
 }

tests/cpp/tree/gpu_hist/test_histogram.cu

@@ -14,6 +14,46 @@
 #include "../../helpers.h"
 
 namespace xgboost::tree {
+TEST(Histogram, DeviceHistogramStorage) {
+  // Ensures that nodes are allocated correctly after reaching `kStopGrowingSize`.
+  auto ctx = MakeCUDACtx(0);
+  constexpr size_t kNBins = 128;
+  constexpr int kNNodes = 4;
+  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+  DeviceHistogramStorage<kStopGrowing> histogram;
+  histogram.Init(FstCU(), kNBins);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  histogram.Reset(&ctx);
+  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+  // Use the allocated memory but do not erase nidx_map.
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Add two new nodes; these land in the overflow buffer.
+  histogram.AllocateHistograms(&ctx, {kNNodes});
+  histogram.AllocateHistograms(&ctx, {kNNodes + 1});
+
+  // Old cached nodes should still exist.
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // The first overflow node should have been evicted.
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // The most recent node should exist.
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Adding the same node again should fail.
+  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}););
+}
+
 void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
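
A note on the sizing in the `DeviceHistogramStorage` test above: each bin stores a `GradientPairInt64`, i.e. two `ValueT` items (`kNumItemsInGradientSum == 2`), so one node's histogram occupies 128 * 2 = 256 items and four nodes fill the 1024-item cache exactly. The fifth and sixth allocations therefore land in the overflow buffer, and since overflow keeps only the most recent allocation, node `kNNodes` is gone once `kNNodes + 1` is allocated. The snippet below merely restates that arithmetic; it is not part of the commit:

    #include <cstddef>
    constexpr std::size_t kNBins = 128;       // bins per histogram
    constexpr std::size_t kItemsPerBin = 2;   // GradientPairInt64 = 2 ValueT items
    constexpr std::size_t kHistSize = kNBins * kItemsPerBin;  // 256 items per node
    constexpr std::size_t kStopGrowing = 4 * kHistSize;       // 1024 items = 4 nodes
    static_assert(4 * kHistSize >= kStopGrowing, "the fifth allocation must overflow");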

tests/cpp/tree/test_gpu_hist.cu

@@ -6,7 +6,6 @@
 #include <thrust/host_vector.h>
 #include <xgboost/base.h>
 
-#include <random>
 #include <string>
 #include <vector>
@@ -23,46 +22,6 @@
 #include "xgboost/json.h"
 
 namespace xgboost::tree {
-TEST(GpuHist, DeviceHistogramStorage) {
-  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
-  dh::safe_cuda(cudaSetDevice(0));
-  constexpr size_t kNBins = 128;
-  constexpr int kNNodes = 4;
-  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(FstCU(), kNBins);
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  histogram.Reset();
-  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
-
-  // Use allocated memory but do not erase nidx_map.
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Add two new nodes
-  histogram.AllocateHistograms({kNNodes});
-  histogram.AllocateHistograms({kNNodes + 1});
-
-  // Old cached nodes should still exist
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Should be deleted
-  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
-  // Most recent node should exist
-  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
-
-  // Add same node again - should fail
-  EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
-}
-
 std::vector<GradientPairPrecise> GetHostHistGpair() {
   // 24 bins, 3 bins for each feature (column).
   std::vector<GradientPairPrecise> hist_gpair = {
@@ -108,7 +67,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0);
 
   maker.hist.Init(ctx.Device(), page->Cuts().TotalBins());
-  maker.hist.AllocateHistograms({0});
+  maker.hist.AllocateHistograms(&ctx, {0});
 
   maker.gpair = gpair.DeviceSpan();
   maker.quantiser = std::make_unique<GradientQuantiser>(&ctx, maker.gpair, MetaInfo());
@@ -425,8 +384,8 @@ TEST(GpuHist, MaxDepth) {
 namespace {
 RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUHistMaker hist_maker{ctx, &task};
-  hist_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
+  hist_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -436,7 +395,7 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                    {&tree});
+  hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                     {&tree});
   return tree;
 }
@@ -476,8 +435,8 @@ TEST_F(MGPUHistTest, HistColumnSplit) {
 namespace {
 RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUGlobalApproxMaker approx_maker{ctx, &task};
-  approx_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)};
+  approx_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -487,13 +446,13 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                      {&tree});
+  approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                       {&tree});
   return tree;
 }
 
 void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
-  Context ctx(MakeCUDACtx(GPUIDX));
+  auto ctx = MakeCUDACtx(DistGpuIdx());
 
   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
   auto const world_size = collective::GetWorldSize();