Move device histogram storage into histogram.cuh. (#10608)
This commit is contained in:
@@ -5,12 +5,14 @@
|
||||
#define HISTOGRAM_CUH_
|
||||
#include <memory> // for unique_ptr
|
||||
|
||||
#include "../../common/cuda_context.cuh" // for CUDAContext
|
||||
#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor
|
||||
#include "feature_groups.cuh" // for FeatureGroupsAccessor
|
||||
#include "xgboost/base.h" // for GradientPair, GradientPairInt64
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/span.h" // for Span
|
||||
#include "../../common/cuda_context.cuh" // for CUDAContext
|
||||
#include "../../common/device_helpers.cuh" // for LaunchN
|
||||
#include "../../common/device_vector.cuh" // for device_vector
|
||||
#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor
|
||||
#include "feature_groups.cuh" // for FeatureGroupsAccessor
|
||||
#include "xgboost/base.h" // for GradientPair, GradientPairInt64
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::tree {
|
||||
/**
|
||||
@@ -60,6 +62,111 @@ class GradientQuantiser {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Data storage for node histograms on device. Automatically expands.
|
||||
*
|
||||
* @tparam kStopGrowingSize Do not grow beyond this size
|
||||
*
|
||||
* @author Rory
|
||||
* @date 28/07/2018
|
||||
*/
|
||||
template <size_t kStopGrowingSize = 1 << 28>
|
||||
class DeviceHistogramStorage {
|
||||
private:
|
||||
using GradientSumT = GradientPairInt64;
|
||||
/** @brief Map nidx to starting index of its histogram. */
|
||||
std::map<int, size_t> nidx_map_;
|
||||
// Large buffer of zeroed memory, caches histograms
|
||||
dh::device_vector<typename GradientSumT::ValueT> data_;
|
||||
// If we run out of storage allocate one histogram at a time
|
||||
// in overflow. Not cached, overwritten when a new histogram
|
||||
// is requested
|
||||
dh::device_vector<typename GradientSumT::ValueT> overflow_;
|
||||
std::map<int, size_t> overflow_nidx_map_;
|
||||
int n_bins_;
|
||||
DeviceOrd device_id_;
|
||||
static constexpr size_t kNumItemsInGradientSum =
|
||||
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
|
||||
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
|
||||
|
||||
public:
|
||||
// Start with about 16mb
|
||||
DeviceHistogramStorage() { data_.reserve(1 << 22); }
|
||||
void Init(DeviceOrd device_id, int n_bins) {
|
||||
this->n_bins_ = n_bins;
|
||||
this->device_id_ = device_id;
|
||||
}
|
||||
|
||||
void Reset(Context const* ctx) {
|
||||
auto d_data = data_.data().get();
|
||||
dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
|
||||
[=] __device__(size_t idx) { d_data[idx] = 0.0f; });
|
||||
nidx_map_.clear();
|
||||
overflow_nidx_map_.clear();
|
||||
}
|
||||
[[nodiscard]] bool HistogramExists(int nidx) const {
|
||||
return nidx_map_.find(nidx) != nidx_map_.cend() ||
|
||||
overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
|
||||
}
|
||||
[[nodiscard]] int Bins() const { return n_bins_; }
|
||||
[[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
|
||||
dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
|
||||
|
||||
void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
|
||||
for (int nidx : new_nidxs) {
|
||||
CHECK(!HistogramExists(nidx));
|
||||
}
|
||||
// Number of items currently used in data
|
||||
const size_t used_size = nidx_map_.size() * HistogramSize();
|
||||
const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
|
||||
if (used_size >= kStopGrowingSize) {
|
||||
// Use overflow
|
||||
// Delete previous entries
|
||||
overflow_nidx_map_.clear();
|
||||
overflow_.resize(HistogramSize() * new_nidxs.size());
|
||||
// Zero memory
|
||||
auto d_data = overflow_.data().get();
|
||||
dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
|
||||
[=] __device__(size_t idx) { d_data[idx] = 0.0; });
|
||||
// Append new histograms
|
||||
for (int nidx : new_nidxs) {
|
||||
overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
|
||||
}
|
||||
} else {
|
||||
CHECK_GE(data_.size(), used_size);
|
||||
// Expand if necessary
|
||||
if (data_.size() < new_used_size) {
|
||||
data_.resize(std::max(data_.size() * 2, new_used_size));
|
||||
}
|
||||
// Append new histograms
|
||||
for (int nidx : new_nidxs) {
|
||||
nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
|
||||
}
|
||||
|
||||
/**
|
||||
* \summary Return pointer to histogram memory for a given node.
|
||||
* \param nidx Tree node index.
|
||||
* \return hist pointer.
|
||||
*/
|
||||
common::Span<GradientSumT> GetNodeHistogram(int nidx) {
|
||||
CHECK(this->HistogramExists(nidx));
|
||||
|
||||
if (nidx_map_.find(nidx) != nidx_map_.cend()) {
|
||||
// Fetch from normal cache
|
||||
auto ptr = data_.data().get() + nidx_map_.at(nidx);
|
||||
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
|
||||
} else {
|
||||
// Fetch from overflow
|
||||
auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
|
||||
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class DeviceHistogramBuilderImpl;
|
||||
|
||||
class DeviceHistogramBuilder {
|
||||
|
||||
@@ -49,113 +49,6 @@ namespace xgboost::tree {
|
||||
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
|
||||
#endif // !defined(GTEST_TEST)
|
||||
|
||||
/**
|
||||
* \struct DeviceHistogramStorage
|
||||
*
|
||||
* \summary Data storage for node histograms on device. Automatically expands.
|
||||
*
|
||||
* \tparam GradientSumT histogram entry type.
|
||||
* \tparam kStopGrowingSize Do not grow beyond this size
|
||||
*
|
||||
* \author Rory
|
||||
* \date 28/07/2018
|
||||
*/
|
||||
template <size_t kStopGrowingSize = 1 << 28>
|
||||
class DeviceHistogramStorage {
|
||||
private:
|
||||
using GradientSumT = GradientPairInt64;
|
||||
/*! \brief Map nidx to starting index of its histogram. */
|
||||
std::map<int, size_t> nidx_map_;
|
||||
// Large buffer of zeroed memory, caches histograms
|
||||
dh::device_vector<typename GradientSumT::ValueT> data_;
|
||||
// If we run out of storage allocate one histogram at a time
|
||||
// in overflow. Not cached, overwritten when a new histogram
|
||||
// is requested
|
||||
dh::device_vector<typename GradientSumT::ValueT> overflow_;
|
||||
std::map<int, size_t> overflow_nidx_map_;
|
||||
int n_bins_;
|
||||
DeviceOrd device_id_;
|
||||
static constexpr size_t kNumItemsInGradientSum =
|
||||
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
|
||||
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
|
||||
|
||||
public:
|
||||
// Start with about 16mb
|
||||
DeviceHistogramStorage() { data_.reserve(1 << 22); }
|
||||
void Init(DeviceOrd device_id, int n_bins) {
|
||||
this->n_bins_ = n_bins;
|
||||
this->device_id_ = device_id;
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
auto d_data = data_.data().get();
|
||||
dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
|
||||
nidx_map_.clear();
|
||||
overflow_nidx_map_.clear();
|
||||
}
|
||||
[[nodiscard]] bool HistogramExists(int nidx) const {
|
||||
return nidx_map_.find(nidx) != nidx_map_.cend() ||
|
||||
overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
|
||||
}
|
||||
[[nodiscard]] int Bins() const { return n_bins_; }
|
||||
[[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
|
||||
dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
|
||||
|
||||
void AllocateHistograms(const std::vector<int>& new_nidxs) {
|
||||
for (int nidx : new_nidxs) {
|
||||
CHECK(!HistogramExists(nidx));
|
||||
}
|
||||
// Number of items currently used in data
|
||||
const size_t used_size = nidx_map_.size() * HistogramSize();
|
||||
const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
|
||||
if (used_size >= kStopGrowingSize) {
|
||||
// Use overflow
|
||||
// Delete previous entries
|
||||
overflow_nidx_map_.clear();
|
||||
overflow_.resize(HistogramSize() * new_nidxs.size());
|
||||
// Zero memory
|
||||
auto d_data = overflow_.data().get();
|
||||
dh::LaunchN(overflow_.size(),
|
||||
[=] __device__(size_t idx) { d_data[idx] = 0.0; });
|
||||
// Append new histograms
|
||||
for (int nidx : new_nidxs) {
|
||||
overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
|
||||
}
|
||||
} else {
|
||||
CHECK_GE(data_.size(), used_size);
|
||||
// Expand if necessary
|
||||
if (data_.size() < new_used_size) {
|
||||
data_.resize(std::max(data_.size() * 2, new_used_size));
|
||||
}
|
||||
// Append new histograms
|
||||
for (int nidx : new_nidxs) {
|
||||
nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
|
||||
}
|
||||
|
||||
/**
|
||||
* \summary Return pointer to histogram memory for a given node.
|
||||
* \param nidx Tree node index.
|
||||
* \return hist pointer.
|
||||
*/
|
||||
common::Span<GradientSumT> GetNodeHistogram(int nidx) {
|
||||
CHECK(this->HistogramExists(nidx));
|
||||
|
||||
if (nidx_map_.find(nidx) != nidx_map_.cend()) {
|
||||
// Fetch from normal cache
|
||||
auto ptr = data_.data().get() + nidx_map_.at(nidx);
|
||||
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
|
||||
} else {
|
||||
// Fetch from overflow
|
||||
auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
|
||||
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Manage memory for a single GPU
|
||||
struct GPUHistMakerDevice {
|
||||
private:
|
||||
@@ -258,7 +151,7 @@ struct GPUHistMakerDevice {
|
||||
|
||||
// Init histogram
|
||||
hist.Init(ctx_->Device(), page->Cuts().TotalBins());
|
||||
hist.Reset();
|
||||
hist.Reset(ctx_);
|
||||
|
||||
this->InitFeatureGroupsOnce();
|
||||
|
||||
@@ -657,7 +550,7 @@ struct GPUHistMakerDevice {
|
||||
all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
|
||||
// Allocate the histograms
|
||||
// Guaranteed contiguous memory
|
||||
hist.AllocateHistograms(all_new);
|
||||
hist.AllocateHistograms(ctx_, all_new);
|
||||
|
||||
for (auto nidx : hist_nidx) {
|
||||
this->BuildHist(nidx);
|
||||
@@ -748,7 +641,7 @@ struct GPUHistMakerDevice {
|
||||
ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
|
||||
collective::SafeColl(rc);
|
||||
|
||||
hist.AllocateHistograms({kRootNIdx});
|
||||
hist.AllocateHistograms(ctx_, {kRootNIdx});
|
||||
this->BuildHist(kRootNIdx);
|
||||
this->AllReduceHist(kRootNIdx, 1);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user