Support optimal partitioning for GPU hist. (#7652)
* Implement `MaxCategory` in quantile.
* Implement partition-based split for GPU evaluation. Currently, it's based on the existing evaluation function.
* Extract an evaluator from GPU Hist to store the needed states.
* Added some CUDA stream/event utilities.
* Update document with references.
* Fixed a bug in approx evaluator where the number of data points is less than the number of categories.
This commit is contained in:
@@ -16,6 +16,10 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
using CatBitField = LBitField32;
|
||||
using KCatBitField = CLBitField32;
|
||||
|
||||
// Cast the categorical type.
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE bst_cat_t AsCat(T const& v) {
|
||||
@@ -57,6 +61,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat
|
||||
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
|
||||
return dft_left;
|
||||
}
|
||||
|
||||
auto pos = KCatBitField::ToBitPos(cat);
|
||||
if (pos.int_pos >= cats.size()) {
|
||||
return true;
|
||||
}
|
||||
return !s_cats.Check(AsCat(cat));
|
||||
}
|
||||
|
||||
@@ -73,18 +82,14 @@ inline void InvalidCategory() {
|
||||
/*!
|
||||
* \brief Whether we should use one-hot encoding for categorical data.
|
||||
*/
|
||||
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
|
||||
bool use_one_hot = n_cats < max_cat_to_onehot ||
|
||||
(task.task != ObjInfo::kRegression && task.task != ObjInfo::kBinary);
|
||||
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
|
||||
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
|
||||
return use_one_hot;
|
||||
}
|
||||
|
||||
/*!
 * \brief Unary predicate testing whether a feature type is categorical.
 */
struct IsCatOp {
  /*!
   * \param ft Feature type to test.
   * \return true when ft equals FeatureType::kCategorical.
   *
   * The operator is const-qualified: the functor holds no state, and this lets it be
   * invoked through const copies or const references as well as mutable ones.
   */
  XGBOOST_DEVICE bool operator()(FeatureType ft) const {
    return ft == FeatureType::kCategorical;
  }
};
|
||||
|
||||
using CatBitField = LBitField32;
|
||||
using KCatBitField = CLBitField32;
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -952,22 +952,22 @@ thrust::device_ptr<T const> tcend(xgboost::HostDeviceVector<T> const& vector) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
|
||||
return thrust::device_ptr<T>(span.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return thrust::device_ptr<T>(span.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
|
||||
return tbegin(span) + span.size();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return tbegin(span) + span.size();
|
||||
}
|
||||
|
||||
@@ -982,12 +982,12 @@ XGBOOST_DEVICE auto trend(xgboost::common::Span<T> &span) { // NOLINT
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return thrust::device_ptr<T const>(span.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return tcbegin(span) + span.size();
|
||||
}
|
||||
|
||||
@@ -1536,4 +1536,69 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
|
||||
safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
|
||||
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
class CUDAStreamView;
|
||||
|
||||
/*!
 * \brief Owning wrapper around a cudaEvent_t.
 *
 * The event is created with cudaEventDisableTiming in the constructor and destroyed in
 * the destructor.  Copy construction and copy assignment are deleted.
 */
class CUDAEvent {
  cudaEvent_t event_{nullptr};

 public:
  CUDAEvent() {
    dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
  }
  ~CUDAEvent() {
    // Nothing to release when no event handle is held.
    if (event_ == nullptr) {
      return;
    }
    dh::safe_cuda(cudaEventDestroy(event_));
  }

  CUDAEvent(CUDAEvent const &that) = delete;
  CUDAEvent &operator=(CUDAEvent const &that) = delete;

  /*! \brief Record this event on the given stream.  Defined out of class. */
  inline void Record(CUDAStreamView stream);  // NOLINT

  /*! \brief Implicit conversion to the underlying raw event handle. */
  operator cudaEvent_t() const { return event_; }  // NOLINT
};
|
||||
|
||||
/*!
 * \brief Thin view over a cudaStream_t.
 *
 * The class only stores the handle; it neither creates nor destroys the stream.
 */
class CUDAStreamView {
  cudaStream_t stream_{nullptr};

 public:
  explicit CUDAStreamView(cudaStream_t s) : stream_{s} {}

  /*!
   * \brief Call cudaStreamWaitEvent on this stream for the given event.
   *
   * \param e Event to wait on.
   */
  void Wait(CUDAEvent const &e) {
#if defined(__CUDACC_VER_MAJOR__)
#if __CUDACC_VER_MAJOR__ < 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0)
    // CUDA <= 11.0: pass the literal flag value.  The original guard only covered 11.0
    // exactly, which sent older toolkits down the branch using the named constant.
    dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, 0));
#else
    // CUDA > 11.0
    dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
#endif  // CUDA <= 11.0
#else   // clang
    dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
#endif  // defined(__CUDACC_VER_MAJOR__)
  }

  /*! \brief Implicit conversion to the underlying raw stream handle. */
  operator cudaStream_t() const {  // NOLINT
    return stream_;
  }

  /*! \brief Block the calling host thread via cudaStreamSynchronize on this stream. */
  void Sync() { dh::safe_cuda(cudaStreamSynchronize(stream_)); }
};
|
||||
|
||||
/*!
 * \brief Record this event on the given stream via cudaEventRecord.
 *
 * \param stream View of the stream on which the event is recorded.
 */
inline void CUDAEvent::Record(CUDAStreamView stream) {  // NOLINT
  // Unwrap the view through its conversion operator before handing it to the runtime.
  auto const raw_stream = static_cast<cudaStream_t>(stream);
  dh::safe_cuda(cudaEventRecord(event_, raw_stream));
}
|
||||
|
||||
/*! \brief Return a view wrapping the cudaStreamLegacy handle. */
inline CUDAStreamView DefaultStream() {
  CUDAStreamView legacy_view{cudaStreamLegacy};
  return legacy_view;
}
|
||||
|
||||
/*!
 * \brief Owning wrapper around a cudaStream_t.
 *
 * The stream is created with cudaStreamNonBlocking in the constructor and destroyed in
 * the destructor.  Copying is disabled: a copy would hold the same handle and destroy it
 * a second time.
 */
class CUDAStream {
  cudaStream_t stream_{nullptr};

 public:
  CUDAStream() {
    dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
  }
  ~CUDAStream() {
    dh::safe_cuda(cudaStreamDestroy(stream_));
  }

  // Single owner of the handle, matching CUDAEvent.
  CUDAStream(CUDAStream const &that) = delete;
  CUDAStream &operator=(CUDAStream const &that) = delete;

  /*! \brief Return a view of the owned stream; the view does not extend its lifetime. */
  CUDAStreamView View() const { return CUDAStreamView{stream_}; }
};
|
||||
} // namespace dh
|
||||
|
||||
@@ -33,66 +33,84 @@ namespace common {
|
||||
*/
|
||||
using GHistIndexRow = Span<uint32_t const>;
|
||||
|
||||
// A CSC matrix representing histogram cuts, used in CPU quantile hist.
|
||||
// A CSC matrix representing histogram cuts.
|
||||
// The cut values represent upper bounds of bins containing approximately equal numbers of elements
|
||||
class HistogramCuts {
|
||||
bool has_categorical_{false};
|
||||
float max_cat_{-1.0f};
|
||||
|
||||
protected:
|
||||
using BinIdx = uint32_t;
|
||||
|
||||
void Swap(HistogramCuts&& that) noexcept(true) {
|
||||
std::swap(cut_values_, that.cut_values_);
|
||||
std::swap(cut_ptrs_, that.cut_ptrs_);
|
||||
std::swap(min_vals_, that.min_vals_);
|
||||
|
||||
std::swap(has_categorical_, that.has_categorical_);
|
||||
std::swap(max_cat_, that.max_cat_);
|
||||
}
|
||||
|
||||
void Copy(HistogramCuts const& that) {
|
||||
cut_values_.Resize(that.cut_values_.Size());
|
||||
cut_ptrs_.Resize(that.cut_ptrs_.Size());
|
||||
min_vals_.Resize(that.min_vals_.Size());
|
||||
cut_values_.Copy(that.cut_values_);
|
||||
cut_ptrs_.Copy(that.cut_ptrs_);
|
||||
min_vals_.Copy(that.min_vals_);
|
||||
has_categorical_ = that.has_categorical_;
|
||||
max_cat_ = that.max_cat_;
|
||||
}
|
||||
|
||||
public:
|
||||
HostDeviceVector<bst_float> cut_values_; // NOLINT
|
||||
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
|
||||
HostDeviceVector<float> cut_values_; // NOLINT
|
||||
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
|
||||
// storing minimum value in a sketch set.
|
||||
HostDeviceVector<float> min_vals_; // NOLINT
|
||||
|
||||
HistogramCuts();
|
||||
HistogramCuts(HistogramCuts const& that) {
|
||||
cut_values_.Resize(that.cut_values_.Size());
|
||||
cut_ptrs_.Resize(that.cut_ptrs_.Size());
|
||||
min_vals_.Resize(that.min_vals_.Size());
|
||||
cut_values_.Copy(that.cut_values_);
|
||||
cut_ptrs_.Copy(that.cut_ptrs_);
|
||||
min_vals_.Copy(that.min_vals_);
|
||||
}
|
||||
HistogramCuts(HistogramCuts const& that) { this->Copy(that); }
|
||||
|
||||
HistogramCuts(HistogramCuts&& that) noexcept(true) {
|
||||
*this = std::forward<HistogramCuts&&>(that);
|
||||
this->Swap(std::forward<HistogramCuts>(that));
|
||||
}
|
||||
|
||||
HistogramCuts& operator=(HistogramCuts const& that) {
|
||||
cut_values_.Resize(that.cut_values_.Size());
|
||||
cut_ptrs_.Resize(that.cut_ptrs_.Size());
|
||||
min_vals_.Resize(that.min_vals_.Size());
|
||||
cut_values_.Copy(that.cut_values_);
|
||||
cut_ptrs_.Copy(that.cut_ptrs_);
|
||||
min_vals_.Copy(that.min_vals_);
|
||||
this->Copy(that);
|
||||
return *this;
|
||||
}
|
||||
|
||||
HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
|
||||
cut_ptrs_ = std::move(that.cut_ptrs_);
|
||||
cut_values_ = std::move(that.cut_values_);
|
||||
min_vals_ = std::move(that.min_vals_);
|
||||
this->Swap(std::forward<HistogramCuts>(that));
|
||||
return *this;
|
||||
}
|
||||
|
||||
uint32_t FeatureBins(uint32_t feature) const {
|
||||
return cut_ptrs_.ConstHostVector().at(feature + 1) -
|
||||
cut_ptrs_.ConstHostVector()[feature];
|
||||
uint32_t FeatureBins(bst_feature_t feature) const {
|
||||
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
|
||||
}
|
||||
|
||||
// Getters. Cuts should be of no use after building histogram indices, but currently
|
||||
// they are deeply linked with quantile_hist, gpu sketcher and gpu_hist, so we preserve
|
||||
// these for now.
|
||||
std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
|
||||
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
|
||||
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
|
||||
|
||||
bool HasCategorical() const { return has_categorical_; }
|
||||
float MaxCategory() const { return max_cat_; }
|
||||
/**
|
||||
* \brief Set meta info about categorical features.
|
||||
*
|
||||
* \param has_cat Do we have categorical feature in the data?
|
||||
* \param max_cat The maximum categorical value in all features.
|
||||
*/
|
||||
void SetCategorical(bool has_cat, float max_cat) {
|
||||
has_categorical_ = has_cat;
|
||||
max_cat_ = max_cat;
|
||||
}
|
||||
|
||||
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
|
||||
|
||||
// Return the index of a cut point that is strictly greater than the input
|
||||
// value, or the last available index if none exists
|
||||
BinIdx SearchBin(float value, uint32_t column_id, std::vector<uint32_t> const& ptrs,
|
||||
BinIdx SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& values) const {
|
||||
auto end = ptrs[column_id + 1];
|
||||
auto beg = ptrs[column_id];
|
||||
@@ -102,7 +120,7 @@ class HistogramCuts {
|
||||
return idx;
|
||||
}
|
||||
|
||||
BinIdx SearchBin(float value, uint32_t column_id) const {
|
||||
BinIdx SearchBin(float value, bst_feature_t column_id) const {
|
||||
return this->SearchBin(value, column_id, Ptrs(), Values());
|
||||
}
|
||||
|
||||
|
||||
@@ -272,7 +272,7 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
|
||||
// move all categories into a flatten vector to prepare for allreduce
|
||||
size_t total = feature_ptr.back();
|
||||
std::vector<bst_cat_t> flatten(total, 0);
|
||||
std::vector<float> flatten(total, 0);
|
||||
auto cursor{flatten.begin()};
|
||||
for (auto const &feat : categories) {
|
||||
cursor = std::copy(feat.cbegin(), feat.cend(), cursor);
|
||||
@@ -287,15 +287,15 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
auto gtotal = global_worker_ptr.back();
|
||||
|
||||
// categories in all workers with all features.
|
||||
std::vector<bst_cat_t> global_categories(gtotal, 0);
|
||||
std::vector<float> global_categories(gtotal, 0);
|
||||
auto rank_begin = global_worker_ptr[rank];
|
||||
auto rank_size = global_worker_ptr[rank + 1] - rank_begin;
|
||||
CHECK_EQ(rank_size, total);
|
||||
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
|
||||
// gather values from all workers.
|
||||
rabit::Allreduce<rabit::op::Sum>(global_categories.data(), global_categories.size());
|
||||
QuantileAllreduce<bst_cat_t> allreduce_result{global_categories, global_worker_ptr,
|
||||
global_feat_ptrs, categories.size()};
|
||||
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
|
||||
categories.size()};
|
||||
ParallelFor(categories.size(), n_threads, [&](auto fidx) {
|
||||
if (!IsCat(feature_types, fidx)) {
|
||||
return;
|
||||
@@ -531,6 +531,22 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
|
||||
InvalidCategory();
|
||||
}
|
||||
}
|
||||
auto const &ptrs = cuts->Ptrs();
|
||||
auto const &vals = cuts->Values();
|
||||
|
||||
float max_cat{-std::numeric_limits<float>::infinity()};
|
||||
for (size_t i = 1; i < ptrs.size(); ++i) {
|
||||
if (IsCat(feature_types_, i - 1)) {
|
||||
auto beg = ptrs[i - 1];
|
||||
auto end = ptrs[i];
|
||||
auto feat = Span<float const>{vals}.subspan(beg, end - beg);
|
||||
auto max_elem = *std::max_element(feat.cbegin(), feat.cend());
|
||||
if (max_elem > max_cat) {
|
||||
max_cat = max_elem;
|
||||
}
|
||||
}
|
||||
}
|
||||
cuts->SetCategorical(true, max_cat);
|
||||
}
|
||||
|
||||
monitor_.Stop(__func__);
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
/*!
|
||||
* Copyright 2020 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/binary_search.h>
|
||||
#include <thrust/transform_scan.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/transform_scan.h>
|
||||
#include <thrust/unique.h>
|
||||
|
||||
#include <limits> // std::numeric_limits
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "quantile.h"
|
||||
#include "quantile.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "quantile.cuh"
|
||||
#include "quantile.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -586,7 +587,7 @@ struct InvalidCatOp {
|
||||
Span<uint32_t const> ptrs;
|
||||
Span<FeatureType const> ft;
|
||||
|
||||
XGBOOST_DEVICE bool operator()(size_t i) {
|
||||
XGBOOST_DEVICE bool operator()(size_t i) const {
|
||||
auto fidx = dh::SegmentId(ptrs, i);
|
||||
return IsCat(ft, fidx) && InvalidCat(values[i]);
|
||||
}
|
||||
@@ -683,18 +684,36 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
|
||||
out_column[idx] = in_column[idx+1].value;
|
||||
});
|
||||
|
||||
float max_cat{-1.0f};
|
||||
if (has_categorical_) {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
auto ptrs = p_cuts->cut_ptrs_.ConstDeviceSpan();
|
||||
auto it = thrust::make_counting_iterator(0ul);
|
||||
auto invalid_op = InvalidCatOp{out_cut_values, d_out_columns_ptr, d_ft};
|
||||
auto it = dh::MakeTransformIterator<thrust::pair<bool, float>>(
|
||||
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) {
|
||||
auto fidx = dh::SegmentId(d_out_columns_ptr, i);
|
||||
if (IsCat(d_ft, fidx)) {
|
||||
auto invalid = invalid_op(i);
|
||||
auto v = out_cut_values[i];
|
||||
return thrust::make_pair(invalid, v);
|
||||
}
|
||||
return thrust::make_pair(false, std::numeric_limits<float>::min());
|
||||
});
|
||||
|
||||
CHECK_EQ(p_cuts->Ptrs().back(), out_cut_values.size());
|
||||
auto invalid = thrust::any_of(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
|
||||
InvalidCatOp{out_cut_values, ptrs, d_ft});
|
||||
bool invalid{false};
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
thrust::tie(invalid, max_cat) =
|
||||
thrust::reduce(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
|
||||
thrust::make_pair(false, std::numeric_limits<float>::min()),
|
||||
[=] XGBOOST_DEVICE(thrust::pair<bool, bst_cat_t> const &l,
|
||||
thrust::pair<bool, bst_cat_t> const &r) {
|
||||
return thrust::make_pair(l.first || r.first, std::max(l.second, r.second));
|
||||
});
|
||||
if (invalid) {
|
||||
InvalidCategory();
|
||||
}
|
||||
}
|
||||
|
||||
p_cuts->SetCategorical(this->has_categorical_, max_cat);
|
||||
|
||||
timer_.Stop(__func__);
|
||||
}
|
||||
} // namespace common
|
||||
|
||||
@@ -1,9 +1,14 @@
|
||||
/*!
|
||||
* Copyright 2020-2021 by XGBoost Contributors
|
||||
* Copyright 2020-2022 by XGBoost Contributors
|
||||
*/
|
||||
#include <algorithm> // std::max
|
||||
#include <limits>
|
||||
#include "evaluate_splits.cuh"
|
||||
|
||||
#include "../../common/categorical.h"
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "expand_entry.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -23,7 +28,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
|
||||
float missing_right_gain = evaluator.CalcSplitGain(
|
||||
param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan));
|
||||
|
||||
if (missing_left_gain >= missing_right_gain) {
|
||||
if (missing_left_gain > missing_right_gain) {
|
||||
missing_left_out = true;
|
||||
return missing_left_gain - parent_gain;
|
||||
} else {
|
||||
@@ -69,108 +74,61 @@ ReduceFeature(common::Span<const GradientSumT> feature_histogram,
|
||||
return shared_sum;
|
||||
}
|
||||
|
||||
template <typename GradientSumT, typename TempStorageT> struct OneHotBin {
|
||||
GradientSumT __device__ operator()(bool thread_active, uint32_t scan_begin,
|
||||
SumCallbackOp<GradientSumT> *,
|
||||
GradientPairPrecise const &missing,
|
||||
EvaluateSplitInputs<GradientSumT> const &inputs,
|
||||
TempStorageT *) {
|
||||
GradientSumT bin = thread_active
|
||||
? inputs.gradient_histogram[scan_begin + threadIdx.x]
|
||||
: GradientSumT();
|
||||
auto rest = inputs.parent_sum - GradientPairPrecise(bin) - missing;
|
||||
return GradientSumT{rest};
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GradientSumT>
|
||||
struct UpdateOneHot {
|
||||
void __device__ operator()(bool missing_left, uint32_t scan_begin, float gain,
|
||||
bst_feature_t fidx, GradientPairPrecise const &missing,
|
||||
GradientSumT const &bin,
|
||||
EvaluateSplitInputs<GradientSumT> const &inputs,
|
||||
DeviceSplitCandidate *best_split) {
|
||||
int split_gidx = (scan_begin + threadIdx.x);
|
||||
float fvalue = inputs.feature_values[split_gidx];
|
||||
GradientPairPrecise left =
|
||||
missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
|
||||
GradientPairPrecise right = inputs.parent_sum - left;
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, true,
|
||||
inputs.param);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GradientSumT, typename TempStorageT, typename ScanT>
|
||||
struct NumericBin {
|
||||
GradientSumT __device__ operator()(bool thread_active, uint32_t scan_begin,
|
||||
SumCallbackOp<GradientSumT> *prefix_callback,
|
||||
GradientPairPrecise const &missing,
|
||||
EvaluateSplitInputs<GradientSumT> inputs,
|
||||
TempStorageT *temp_storage) {
|
||||
GradientSumT bin = thread_active
|
||||
? inputs.gradient_histogram[scan_begin + threadIdx.x]
|
||||
: GradientSumT();
|
||||
ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), *prefix_callback);
|
||||
return bin;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GradientSumT>
|
||||
struct UpdateNumeric {
|
||||
void __device__ operator()(bool missing_left, uint32_t scan_begin, float gain,
|
||||
bst_feature_t fidx, GradientPairPrecise const &missing,
|
||||
GradientSumT const &bin,
|
||||
EvaluateSplitInputs<GradientSumT> const &inputs,
|
||||
DeviceSplitCandidate *best_split) {
|
||||
// Use pointer from cut to indicate begin and end of bins for each feature.
|
||||
uint32_t gidx_begin = inputs.feature_segments[fidx]; // beginning bin
|
||||
int split_gidx = (scan_begin + threadIdx.x) - 1;
|
||||
float fvalue;
|
||||
if (split_gidx < static_cast<int>(gidx_begin)) {
|
||||
fvalue = inputs.min_fvalue[fidx];
|
||||
} else {
|
||||
fvalue = inputs.feature_values[split_gidx];
|
||||
}
|
||||
GradientPairPrecise left =
|
||||
missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
|
||||
GradientPairPrecise right = inputs.parent_sum - left;
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, false,
|
||||
inputs.param);
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Find the thread with best gain. */
|
||||
template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
|
||||
typename MaxReduceT, typename TempStorageT, typename GradientSumT,
|
||||
typename BinFn, typename UpdateFn>
|
||||
template <int BLOCK_THREADS, typename ReduceT, typename ScanT, typename MaxReduceT,
|
||||
typename TempStorageT, typename GradientSumT, SplitType type>
|
||||
__device__ void EvaluateFeature(
|
||||
int fidx, EvaluateSplitInputs<GradientSumT> inputs,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
DeviceSplitCandidate* best_split, // shared memory storing best split
|
||||
TempStorageT* temp_storage // temp memory for cub operations
|
||||
common::Span<bst_feature_t> sorted_idx, size_t offset,
|
||||
DeviceSplitCandidate *best_split, // shared memory storing best split
|
||||
TempStorageT *temp_storage // temp memory for cub operations
|
||||
) {
|
||||
// Use pointer from cut to indicate begin and end of bins for each feature.
|
||||
uint32_t gidx_begin = inputs.feature_segments[fidx]; // beginning bin
|
||||
uint32_t gidx_end =
|
||||
inputs.feature_segments[fidx + 1]; // end bin for i^th feature
|
||||
auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin);
|
||||
auto bin_fn = BinFn();
|
||||
auto update_fn = UpdateFn();
|
||||
|
||||
// Sum histogram bins for current feature
|
||||
GradientSumT const feature_sum =
|
||||
ReduceFeature<BLOCK_THREADS, ReduceT, TempStorageT, GradientSumT>(
|
||||
feature_hist, temp_storage);
|
||||
ReduceFeature<BLOCK_THREADS, ReduceT, TempStorageT, GradientSumT>(feature_hist, temp_storage);
|
||||
|
||||
GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum};
|
||||
float const null_gain = -std::numeric_limits<bst_float>::infinity();
|
||||
|
||||
SumCallbackOp<GradientSumT> prefix_op = SumCallbackOp<GradientSumT>();
|
||||
for (int scan_begin = gidx_begin; scan_begin < gidx_end;
|
||||
scan_begin += BLOCK_THREADS) {
|
||||
for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) {
|
||||
bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
|
||||
auto bin = bin_fn(thread_active, scan_begin, &prefix_op, missing, inputs, temp_storage);
|
||||
|
||||
auto calc_bin_value = [&]() {
|
||||
GradientSumT bin;
|
||||
switch (type) {
|
||||
case kOneHot: {
|
||||
auto rest =
|
||||
thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT();
|
||||
bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT
|
||||
break;
|
||||
}
|
||||
case kNum: {
|
||||
bin =
|
||||
thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT();
|
||||
ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
|
||||
break;
|
||||
}
|
||||
case kPart: {
|
||||
auto rest = thread_active
|
||||
? inputs.gradient_histogram[sorted_idx[scan_begin + threadIdx.x] - offset]
|
||||
: GradientSumT();
|
||||
// No min value for cat feature, use inclusive scan.
|
||||
ScanT(temp_storage->scan).InclusiveScan(rest, rest, cub::Sum(), prefix_op);
|
||||
bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT
|
||||
break;
|
||||
}
|
||||
}
|
||||
return bin;
|
||||
};
|
||||
auto bin = calc_bin_value();
|
||||
// Whether the gradient of missing values is put to the left side.
|
||||
bool missing_left = true;
|
||||
float gain = null_gain;
|
||||
@@ -193,10 +151,48 @@ __device__ void EvaluateFeature(
|
||||
|
||||
cub::CTA_SYNC();
|
||||
|
||||
// Best thread updates split
|
||||
// Best thread updates the split
|
||||
if (threadIdx.x == block_max.key) {
|
||||
update_fn(missing_left, scan_begin, gain, fidx, missing, bin, inputs,
|
||||
best_split);
|
||||
switch (type) {
|
||||
case kNum: {
|
||||
// Use pointer from cut to indicate begin and end of bins for each feature.
|
||||
uint32_t gidx_begin = inputs.feature_segments[fidx]; // beginning bin
|
||||
int split_gidx = (scan_begin + threadIdx.x) - 1;
|
||||
float fvalue;
|
||||
if (split_gidx < static_cast<int>(gidx_begin)) {
|
||||
fvalue = inputs.min_fvalue[fidx];
|
||||
} else {
|
||||
fvalue = inputs.feature_values[split_gidx];
|
||||
}
|
||||
GradientPairPrecise left =
|
||||
missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
|
||||
GradientPairPrecise right = inputs.parent_sum - left;
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
|
||||
false, inputs.param);
|
||||
break;
|
||||
}
|
||||
case kOneHot: {
|
||||
int32_t split_gidx = (scan_begin + threadIdx.x);
|
||||
float fvalue = inputs.feature_values[split_gidx];
|
||||
GradientPairPrecise left =
|
||||
missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
|
||||
GradientPairPrecise right = inputs.parent_sum - left;
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
|
||||
true, inputs.param);
|
||||
break;
|
||||
}
|
||||
case kPart: {
|
||||
int32_t split_gidx = (scan_begin + threadIdx.x);
|
||||
float fvalue = inputs.feature_values[split_gidx];
|
||||
GradientPairPrecise left =
|
||||
missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
|
||||
GradientPairPrecise right = inputs.parent_sum - left;
|
||||
auto best_thresh = block_max.key; // index of best threshold inside a feature.
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left,
|
||||
right, true, inputs.param);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
cub::CTA_SYNC();
|
||||
}
|
||||
@@ -206,6 +202,8 @@ template <int BLOCK_THREADS, typename GradientSumT>
|
||||
__global__ void EvaluateSplitsKernel(
|
||||
EvaluateSplitInputs<GradientSumT> left,
|
||||
EvaluateSplitInputs<GradientSumT> right,
|
||||
ObjInfo task,
|
||||
common::Span<bst_feature_t> sorted_idx,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
common::Span<DeviceSplitCandidate> out_candidates) {
|
||||
// KeyValuePair here used as threadIdx.x -> gain_value
|
||||
@@ -240,22 +238,26 @@ __global__ void EvaluateSplitsKernel(
|
||||
// One block for each feature. Features are sampled, so fidx != blockIdx.x
|
||||
int fidx = inputs.feature_set[is_left ? blockIdx.x
|
||||
: blockIdx.x - left.feature_set.size()];
|
||||
|
||||
if (common::IsCat(inputs.feature_types, fidx)) {
|
||||
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT,
|
||||
TempStorage, GradientSumT,
|
||||
OneHotBin<GradientSumT, TempStorage>,
|
||||
UpdateOneHot<GradientSumT>>(fidx, inputs, evaluator, &best_split,
|
||||
&temp_storage);
|
||||
auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx];
|
||||
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot, task)) {
|
||||
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
|
||||
kOneHot>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
|
||||
} else {
|
||||
auto node_sorted_idx = is_left ? sorted_idx.first(inputs.feature_values.size())
|
||||
: sorted_idx.last(inputs.feature_values.size());
|
||||
size_t offset = is_left ? 0 : inputs.feature_values.size();
|
||||
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
|
||||
kPart>(fidx, inputs, evaluator, node_sorted_idx, offset, &best_split,
|
||||
&temp_storage);
|
||||
}
|
||||
} else {
|
||||
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT,
|
||||
TempStorage, GradientSumT,
|
||||
NumericBin<GradientSumT, TempStorage, BlockScanT>,
|
||||
UpdateNumeric<GradientSumT>>(fidx, inputs, evaluator, &best_split,
|
||||
&temp_storage);
|
||||
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
|
||||
kNum>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
|
||||
}
|
||||
|
||||
cub::CTA_SYNC();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
// Record best loss for each feature
|
||||
out_candidates[blockIdx.x] = best_split;
|
||||
@@ -267,71 +269,175 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a,
|
||||
return b.loss_chg > a.loss_chg ? b : a;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Set the bits for categorical splits based on the split threshold.
|
||||
*/
|
||||
template <typename GradientSumT>
|
||||
void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientSumT> left,
|
||||
EvaluateSplitInputs<GradientSumT> right) {
|
||||
size_t combined_num_features =
|
||||
left.feature_set.size() + right.feature_set.size();
|
||||
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
|
||||
combined_num_features);
|
||||
__device__ void SortBasedSplit(EvaluateSplitInputs<GradientSumT> const &input,
|
||||
common::Span<bst_feature_t const> d_sorted_idx, bst_feature_t fidx,
|
||||
bool is_left, common::Span<common::CatBitField::value_type> out,
|
||||
DeviceSplitCandidate *p_out_split) {
|
||||
auto &out_split = *p_out_split;
|
||||
out_split.split_cats = common::CatBitField{out};
|
||||
auto node_sorted_idx =
|
||||
is_left ? d_sorted_idx.subspan(0, input.feature_values.size())
|
||||
: d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size());
|
||||
size_t node_offset = is_left ? 0 : input.feature_values.size();
|
||||
auto best_thresh = out_split.PopBestThresh();
|
||||
auto f_sorted_idx =
|
||||
node_sorted_idx.subspan(input.feature_segments[fidx], input.FeatureBins(fidx));
|
||||
if (out_split.dir != kLeftDir) {
|
||||
// forward, missing on right
|
||||
auto beg = dh::tcbegin(f_sorted_idx);
|
||||
// Don't put all the categories into one side
|
||||
auto boundary = std::min(static_cast<size_t>((best_thresh + 1)), (f_sorted_idx.size() - 1));
|
||||
boundary = std::max(boundary, static_cast<size_t>(1ul));
|
||||
auto end = beg + boundary;
|
||||
thrust::for_each(thrust::seq, beg, end, [&](auto c) {
|
||||
auto cat = input.feature_values[c - node_offset];
|
||||
assert(!out_split.split_cats.Check(cat) && "already set");
|
||||
out_split.SetCat(cat);
|
||||
});
|
||||
} else {
|
||||
assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0");
|
||||
thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx),
|
||||
dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) {
|
||||
auto cat = input.feature_values[c - node_offset];
|
||||
out_split.SetCat(cat);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
|
||||
EvaluateSplitInputs<GradientSumT> left, EvaluateSplitInputs<GradientSumT> right, ObjInfo task,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
common::Span<DeviceSplitCandidate> out_splits) {
|
||||
if (!split_cats_.empty()) {
|
||||
this->SortHistogram(left, right, evaluator);
|
||||
}
|
||||
|
||||
size_t combined_num_features = left.feature_set.size() + right.feature_set.size();
|
||||
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
|
||||
|
||||
// One block for each feature
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
dh::LaunchKernel {uint32_t(combined_num_features), kBlockThreads, 0}(
|
||||
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, evaluator,
|
||||
dh::ToSpan(feature_best_splits));
|
||||
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
|
||||
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, task, this->SortedIdx(left),
|
||||
evaluator, dh::ToSpan(feature_best_splits));
|
||||
|
||||
// Reduce to get best candidate for left and right child over all features
|
||||
auto reduce_offset =
|
||||
dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
|
||||
[=] __device__(size_t idx) -> size_t {
|
||||
if (idx == 0) {
|
||||
return 0;
|
||||
}
|
||||
if (idx == 1) {
|
||||
return left.feature_set.size();
|
||||
}
|
||||
if (idx == 2) {
|
||||
return combined_num_features;
|
||||
}
|
||||
return 0;
|
||||
});
|
||||
auto reduce_offset = dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
|
||||
[=] __device__(size_t idx) -> size_t {
|
||||
if (idx == 0) {
|
||||
return 0;
|
||||
}
|
||||
if (idx == 1) {
|
||||
return left.feature_set.size();
|
||||
}
|
||||
if (idx == 2) {
|
||||
return combined_num_features;
|
||||
}
|
||||
return 0;
|
||||
});
|
||||
size_t temp_storage_bytes = 0;
|
||||
auto num_segments = out_splits.size();
|
||||
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes,
|
||||
feature_best_splits.data(), out_splits.data(),
|
||||
num_segments, reduce_offset, reduce_offset + 1);
|
||||
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
|
||||
out_splits.data(), num_segments, reduce_offset,
|
||||
reduce_offset + 1);
|
||||
dh::TemporaryArray<int8_t> temp(temp_storage_bytes);
|
||||
cub::DeviceSegmentedReduce::Sum(temp.data().get(), temp_storage_bytes,
|
||||
feature_best_splits.data(), out_splits.data(),
|
||||
num_segments, reduce_offset, reduce_offset + 1);
|
||||
cub::DeviceSegmentedReduce::Sum(temp.data().get(), temp_storage_bytes, feature_best_splits.data(),
|
||||
out_splits.data(), num_segments, reduce_offset,
|
||||
reduce_offset + 1);
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
void EvaluateSingleSplit(common::Span<DeviceSplitCandidate> out_split,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientSumT> input) {
|
||||
EvaluateSplits(out_split, evaluator, input, {});
|
||||
void GPUHistEvaluator<GradientSumT>::CopyToHost(EvaluateSplitInputs<GradientSumT> const &input,
|
||||
common::Span<CatST> cats_out) {
|
||||
if (has_sort_) {
|
||||
dh::CUDAEvent event;
|
||||
event.Record(dh::DefaultStream());
|
||||
auto h_cats = this->HostCatStorage(input.nidx);
|
||||
copy_stream_.View().Wait(event);
|
||||
dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(),
|
||||
cudaMemcpyDeviceToHost, copy_stream_.View()));
|
||||
}
|
||||
}
|
||||
|
||||
template void EvaluateSplits<GradientPair>(
|
||||
common::Span<DeviceSplitCandidate> out_splits,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientPair> left,
|
||||
EvaluateSplitInputs<GradientPair> right);
|
||||
template void EvaluateSplits<GradientPairPrecise>(
|
||||
common::Span<DeviceSplitCandidate> out_splits,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientPairPrecise> left,
|
||||
EvaluateSplitInputs<GradientPairPrecise> right);
|
||||
template void EvaluateSingleSplit<GradientPair>(
|
||||
common::Span<DeviceSplitCandidate> out_split,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientPair> input);
|
||||
template void EvaluateSingleSplit<GradientPairPrecise>(
|
||||
common::Span<DeviceSplitCandidate> out_split,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientPairPrecise> input);
|
||||
template <typename GradientSumT>
|
||||
void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, ObjInfo task,
|
||||
EvaluateSplitInputs<GradientSumT> left,
|
||||
EvaluateSplitInputs<GradientSumT> right,
|
||||
common::Span<GPUExpandEntry> out_entries) {
|
||||
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
|
||||
|
||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(2);
|
||||
auto out_splits = dh::ToSpan(splits_out_storage);
|
||||
this->EvaluateSplits(left, right, task, evaluator, out_splits);
|
||||
|
||||
auto d_sorted_idx = this->SortedIdx(left);
|
||||
auto d_entries = out_entries;
|
||||
auto cats_out = this->DeviceCatStorage(left.nidx);
|
||||
// turn candidate into entry, along with handling sort based split.
|
||||
dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) {
|
||||
auto const &input = i == 0 ? left : right;
|
||||
auto &split = out_splits[i];
|
||||
auto fidx = out_splits[i].findex;
|
||||
|
||||
if (split.is_cat &&
|
||||
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
|
||||
bool is_left = i == 0;
|
||||
auto out = is_left ? cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2);
|
||||
SortBasedSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]);
|
||||
}
|
||||
|
||||
float base_weight =
|
||||
evaluator.CalcWeight(input.nidx, input.param, GradStats{split.left_sum + split.right_sum});
|
||||
float left_weight = evaluator.CalcWeight(input.nidx, input.param, GradStats{split.left_sum});
|
||||
float right_weight = evaluator.CalcWeight(input.nidx, input.param, GradStats{split.right_sum});
|
||||
|
||||
d_entries[i] = GPUExpandEntry{input.nidx, candidate.depth + 1, out_splits[i],
|
||||
base_weight, left_weight, right_weight};
|
||||
});
|
||||
|
||||
this->CopyToHost(left, cats_out);
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
|
||||
EvaluateSplitInputs<GradientSumT> input, float weight, ObjInfo task) {
|
||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out(1);
|
||||
auto out_split = dh::ToSpan(splits_out);
|
||||
auto evaluator = tree_evaluator_.GetEvaluator<GPUTrainingParam>();
|
||||
this->EvaluateSplits(input, {}, task, evaluator, out_split);
|
||||
|
||||
auto cats_out = this->DeviceCatStorage(input.nidx);
|
||||
auto d_sorted_idx = this->SortedIdx(input);
|
||||
|
||||
dh::TemporaryArray<GPUExpandEntry> entries(1);
|
||||
auto d_entries = entries.data().get();
|
||||
dh::LaunchN(1, [=] __device__(size_t i) {
|
||||
auto &split = out_split[i];
|
||||
auto fidx = out_split[i].findex;
|
||||
|
||||
if (split.is_cat &&
|
||||
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
|
||||
SortBasedSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]);
|
||||
}
|
||||
|
||||
float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum});
|
||||
float right_weight = evaluator.CalcWeight(0, input.param, GradStats{split.right_sum});
|
||||
d_entries[0] = GPUExpandEntry(0, 0, split, weight, left_weight, right_weight);
|
||||
});
|
||||
this->CopyToHost(input, cats_out);
|
||||
|
||||
GPUExpandEntry root_entry;
|
||||
dh::safe_cuda(cudaMemcpyAsync(&root_entry, entries.data().get(),
|
||||
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
template class GPUHistEvaluator<GradientPair>;
|
||||
template class GPUHistEvaluator<GradientPairPrecise>;
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -3,15 +3,20 @@
|
||||
*/
|
||||
#ifndef EVALUATE_SPLITS_CUH_
|
||||
#define EVALUATE_SPLITS_CUH_
|
||||
#include <thrust/system/cuda/experimental/pinned_allocator.h>
|
||||
#include <xgboost/span.h>
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
|
||||
#include "../../common/categorical.h"
|
||||
#include "../split_evaluator.h"
|
||||
#include "../constraints.cuh"
|
||||
#include "../updater_gpu_common.cuh"
|
||||
#include "expand_entry.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace common {
|
||||
class HistogramCuts;
|
||||
}
|
||||
|
||||
namespace tree {
|
||||
template <typename GradientSumT>
|
||||
struct EvaluateSplitInputs {
|
||||
int nidx;
|
||||
@@ -23,16 +28,131 @@ struct EvaluateSplitInputs {
|
||||
common::Span<const float> feature_values;
|
||||
common::Span<const float> min_fvalue;
|
||||
common::Span<const GradientSumT> gradient_histogram;
|
||||
|
||||
XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
|
||||
__device__ auto FeatureBins(bst_feature_t fidx) const {
|
||||
return feature_segments[fidx + 1] - feature_segments[fidx];
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GradientSumT>
|
||||
void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientSumT> left,
|
||||
EvaluateSplitInputs<GradientSumT> right);
|
||||
template <typename GradientSumT>
|
||||
void EvaluateSingleSplit(common::Span<DeviceSplitCandidate> out_split,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
EvaluateSplitInputs<GradientSumT> input);
|
||||
class GPUHistEvaluator {
|
||||
using CatST = common::CatBitField::value_type; // categorical storage type
|
||||
// use pinned memory to stage the categories, used for sort based splits.
|
||||
using Alloc = thrust::system::cuda::experimental::pinned_allocator<CatST>;
|
||||
|
||||
private:
|
||||
TreeEvaluator tree_evaluator_;
|
||||
// storage for categories for each node, used for sort based splits.
|
||||
dh::device_vector<CatST> split_cats_;
|
||||
// host storage for categories for each node, used for sort based splits.
|
||||
std::vector<CatST, Alloc> h_split_cats_;
|
||||
// stream for copying categories from device back to host for expanding the decision tree.
|
||||
dh::CUDAStream copy_stream_;
|
||||
// storage for sorted index of feature histogram, used for sort based splits.
|
||||
dh::device_vector<bst_feature_t> cat_sorted_idx_;
|
||||
TrainParam param_;
|
||||
// whether the input data requires sort based split, which is more complicated so we try
|
||||
// to avoid it if possible.
|
||||
bool has_sort_{false};
|
||||
|
||||
// Copy the categories from device to host asynchronously.
|
||||
void CopyToHost(EvaluateSplitInputs<GradientSumT> const &input, common::Span<CatST> cats_out);
|
||||
|
||||
/**
|
||||
* \brief Get host category storage of nidx for internal calculation.
|
||||
*/
|
||||
auto HostCatStorage(bst_node_t nidx) {
|
||||
auto cat_bits = h_split_cats_.size() / param_.MaxNodes();
|
||||
if (nidx == RegTree::kRoot) {
|
||||
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits);
|
||||
return cats_out;
|
||||
}
|
||||
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits * 2);
|
||||
return cats_out;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get device category storage of nidx for internal calculation.
|
||||
*/
|
||||
auto DeviceCatStorage(bst_node_t nidx) {
|
||||
auto cat_bits = split_cats_.size() / param_.MaxNodes();
|
||||
if (nidx == RegTree::kRoot) {
|
||||
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * cat_bits, cat_bits);
|
||||
return cats_out;
|
||||
}
|
||||
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * cat_bits, cat_bits * 2);
|
||||
return cats_out;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get sorted index storage based on the left node of inputs.
|
||||
*/
|
||||
auto SortedIdx(EvaluateSplitInputs<GradientSumT> left) {
|
||||
if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {
|
||||
return dh::ToSpan(cat_sorted_idx_).first(left.feature_values.size());
|
||||
}
|
||||
return dh::ToSpan(cat_sorted_idx_);
|
||||
}
|
||||
|
||||
public:
|
||||
GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, int32_t device)
|
||||
: tree_evaluator_{param, n_features, device}, param_{param} {}
|
||||
/**
|
||||
* \brief Reset the evaluator, should be called before any use.
|
||||
*/
|
||||
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, ObjInfo task,
|
||||
bst_feature_t n_features, TrainParam const ¶m, int32_t device);
|
||||
|
||||
/**
|
||||
* \brief Get host category storage for nidx. Different from the internal version, this
|
||||
* returns strictly 1 node.
|
||||
*/
|
||||
common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
|
||||
copy_stream_.View().Sync();
|
||||
auto cat_bits = h_split_cats_.size() / param_.MaxNodes();
|
||||
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits);
|
||||
return cats_out;
|
||||
}
|
||||
/**
|
||||
* \brief Add a split to the internal tree evaluator.
|
||||
*/
|
||||
void ApplyTreeSplit(GPUExpandEntry const &candidate, RegTree *p_tree) {
|
||||
auto &tree = *p_tree;
|
||||
// Set up child constraints
|
||||
auto left_child = tree[candidate.nid].LeftChild();
|
||||
auto right_child = tree[candidate.nid].RightChild();
|
||||
tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
|
||||
tree[candidate.nid].SplitIndex(), candidate.left_weight,
|
||||
candidate.right_weight);
|
||||
}
|
||||
|
||||
auto GetEvaluator() { return tree_evaluator_.GetEvaluator<GPUTrainingParam>(); }
|
||||
/**
|
||||
* \brief Sort the histogram based on output to obtain contiguous partitions.
|
||||
*/
|
||||
common::Span<bst_feature_t const> SortHistogram(
|
||||
EvaluateSplitInputs<GradientSumT> const &left, EvaluateSplitInputs<GradientSumT> const &right,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
|
||||
|
||||
// impl of evaluate splits, contains CUDA kernels so it's public
|
||||
void EvaluateSplits(EvaluateSplitInputs<GradientSumT> left,
|
||||
EvaluateSplitInputs<GradientSumT> right, ObjInfo task,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||
common::Span<DeviceSplitCandidate> out_splits);
|
||||
/**
|
||||
* \brief Evaluate splits for left and right nodes.
|
||||
*/
|
||||
void EvaluateSplits(GPUExpandEntry candidate, ObjInfo task,
|
||||
EvaluateSplitInputs<GradientSumT> left,
|
||||
EvaluateSplitInputs<GradientSumT> right,
|
||||
common::Span<GPUExpandEntry> out_splits);
|
||||
/**
|
||||
* \brief Evaluate splits for root node.
|
||||
*/
|
||||
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs<GradientSumT> input, float weight,
|
||||
ObjInfo task);
|
||||
};
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
100
src/tree/gpu_hist/evaluator.cu
Normal file
100
src/tree/gpu_hist/evaluator.cu
Normal file
@@ -0,0 +1,100 @@
|
||||
/*!
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
*
|
||||
* \brief Some components of GPU Hist evaluator, this file only exists to reduce nvcc
|
||||
* compilation time.
|
||||
*/
|
||||
#include <thrust/logical.h> // thrust::any_of
|
||||
#include <thrust/sort.h> // thrust::stable_sort
|
||||
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#include "../../common/hist_util.h" // common::HistogramCuts
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
template <typename GradientSumT>
|
||||
void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
|
||||
common::Span<FeatureType const> ft, ObjInfo task,
|
||||
bst_feature_t n_features, TrainParam const ¶m,
|
||||
int32_t device) {
|
||||
param_ = param;
|
||||
tree_evaluator_ = TreeEvaluator{param, n_features, device};
|
||||
if (cuts.HasCategorical() && !task.UseOneHot()) {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
|
||||
auto beg = thrust::make_counting_iterator<size_t>(1ul);
|
||||
auto end = thrust::make_counting_iterator<size_t>(ptrs.size());
|
||||
auto to_onehot = param.max_cat_to_onehot;
|
||||
// This condition avoids sort-based split function calls if the users want
|
||||
// onehot-encoding-based splits.
|
||||
// For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x.
|
||||
has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
|
||||
auto idx = i - 1;
|
||||
if (common::IsCat(ft, idx)) {
|
||||
auto n_bins = ptrs[i] - ptrs[idx];
|
||||
bool use_sort = !common::UseOneHot(n_bins, to_onehot, task);
|
||||
return use_sort;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
if (has_sort_) {
|
||||
auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1);
|
||||
CHECK_NE(bit_storage_size, 0);
|
||||
// We need to allocate for all nodes since the updater can grow the tree layer by
|
||||
// layer, all nodes in the same layer must be preserved until that layer is
|
||||
// finished. We can allocate one layer at a time, but the best case is reducing the
|
||||
// size of the bitset by about a half, at the cost of invoking CUDA malloc many more
|
||||
// times than necessary.
|
||||
split_cats_.resize(param.MaxNodes() * bit_storage_size);
|
||||
h_split_cats_.resize(split_cats_.size());
|
||||
dh::safe_cuda(
|
||||
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
|
||||
|
||||
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
|
||||
EvaluateSplitInputs<GradientSumT> const &left, EvaluateSplitInputs<GradientSumT> const &right,
|
||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
auto sorted_idx = this->SortedIdx(left);
|
||||
dh::Iota(sorted_idx);
|
||||
// sort 2 nodes and all the features at the same time, disregarding column sampling.
|
||||
thrust::stable_sort(
|
||||
thrust::cuda::par(alloc), dh::tbegin(sorted_idx), dh::tend(sorted_idx),
|
||||
[evaluator, left, right] XGBOOST_DEVICE(size_t l, size_t r) {
|
||||
auto l_is_left = l < left.feature_values.size();
|
||||
auto r_is_left = r < left.feature_values.size();
|
||||
if (l_is_left != r_is_left) {
|
||||
return l_is_left; // not the same node
|
||||
}
|
||||
|
||||
auto const &input = l_is_left ? left : right;
|
||||
l -= (l_is_left ? 0 : input.feature_values.size());
|
||||
r -= (r_is_left ? 0 : input.feature_values.size());
|
||||
|
||||
auto lfidx = dh::SegmentId(input.feature_segments, l);
|
||||
auto rfidx = dh::SegmentId(input.feature_segments, r);
|
||||
if (lfidx != rfidx) {
|
||||
return lfidx < rfidx; // not the same feature
|
||||
}
|
||||
if (common::IsCat(input.feature_types, lfidx)) {
|
||||
auto lw = evaluator.CalcWeightCat(input.param, input.gradient_histogram[l]);
|
||||
auto rw = evaluator.CalcWeightCat(input.param, input.gradient_histogram[r]);
|
||||
return lw < rw;
|
||||
}
|
||||
return l < r;
|
||||
});
|
||||
return dh::ToSpan(cat_sorted_idx_);
|
||||
}
|
||||
|
||||
template class GPUHistEvaluator<GradientPair>;
|
||||
template class GPUHistEvaluator<GradientPairPrecise>;
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@@ -4,8 +4,9 @@
|
||||
#ifndef EXPAND_ENTRY_CUH_
|
||||
#define EXPAND_ENTRY_CUH_
|
||||
#include <xgboost/span.h>
|
||||
|
||||
#include "../param.h"
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "../updater_gpu_common.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
@@ -53,7 +53,6 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
enum SplitType { kNum = 0, kOneHot = 1, kPart = 2 };
|
||||
|
||||
// Enumerate/Scan the split values of specific feature
|
||||
// Returns the sum of gradients corresponding to the data points that contains
|
||||
@@ -137,7 +136,7 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
|
||||
static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum},
|
||||
GradStats{right_sum}) -
|
||||
parent.root_gain);
|
||||
split_pt = cut_val[i];
|
||||
split_pt = cut_val[i]; // not used for partition based
|
||||
improved = best.Update(loss_chg, fidx, split_pt, d_step == -1, split_type != kNum,
|
||||
left_sum, right_sum);
|
||||
} else {
|
||||
@@ -180,10 +179,10 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
|
||||
|
||||
if (d_step == 1) {
|
||||
std::for_each(sorted_idx.begin(), sorted_idx.begin() + (best_thresh - ibegin + 1),
|
||||
[&cat_bits](size_t c) { cat_bits.Set(c); });
|
||||
[&](size_t c) { cat_bits.Set(cut_val[c + ibegin]); });
|
||||
} else {
|
||||
std::for_each(sorted_idx.rbegin(), sorted_idx.rbegin() + (ibegin - best_thresh),
|
||||
[&cat_bits](size_t c) { cat_bits.Set(c); });
|
||||
[&](size_t c) { cat_bits.Set(cut_val[c + cut_ptr[fidx]]); });
|
||||
}
|
||||
}
|
||||
p_best->Update(best);
|
||||
@@ -231,6 +230,7 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
|
||||
}
|
||||
}
|
||||
auto evaluator = tree_evaluator_.GetEvaluator();
|
||||
auto const& cut_ptrs = cut.Ptrs();
|
||||
|
||||
common::ParallelFor2d(space, n_threads_, [&](size_t nidx_in_set, common::Range1d r) {
|
||||
auto tidx = omp_get_thread_num();
|
||||
@@ -246,26 +246,22 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
|
||||
continue;
|
||||
}
|
||||
if (is_cat) {
|
||||
auto n_bins = cut.Ptrs().at(fidx + 1) - cut.Ptrs()[fidx];
|
||||
auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
|
||||
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot, task_)) {
|
||||
EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
|
||||
EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
|
||||
} else {
|
||||
auto const &cut_ptr = cut.Ptrs();
|
||||
std::vector<size_t> sorted_idx(n_bins);
|
||||
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
|
||||
auto feat_hist = histogram.subspan(cut_ptr[fidx], n_bins);
|
||||
auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins);
|
||||
// Sort the histogram to get contiguous partitions.
|
||||
std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) {
|
||||
auto ret = evaluator.CalcWeightCat(param_, feat_hist[l]) <
|
||||
evaluator.CalcWeightCat(param_, feat_hist[r]);
|
||||
static_assert(std::is_same<decltype(ret), bool>::value, "");
|
||||
return ret;
|
||||
});
|
||||
auto grad_stats =
|
||||
EnumerateSplit<+1, kPart>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
|
||||
if (SplitContainsMissingValues(grad_stats, snode_[nidx])) {
|
||||
EnumerateSplit<-1, kPart>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
|
||||
}
|
||||
EnumerateSplit<+1, kPart>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
|
||||
EnumerateSplit<-1, kPart>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
|
||||
}
|
||||
} else {
|
||||
auto grad_stats =
|
||||
@@ -313,6 +309,7 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
|
||||
cat_bits.Set(cat);
|
||||
} else {
|
||||
split_cats = candidate.split.cat_bits;
|
||||
common::CatBitField cat_bits{split_cats};
|
||||
}
|
||||
|
||||
tree.ExpandCategorical(
|
||||
|
||||
@@ -110,6 +110,9 @@ class TreeEvaluator {
|
||||
|
||||
template <typename GradientSumT>
|
||||
XGBOOST_DEVICE double CalcWeightCat(ParamT const& param, GradientSumT const& stats) const {
|
||||
// FIXME(jiamingy): This is a temporary solution until we have categorical feature
|
||||
// specific regularization parameters. During sorting we should try to avoid any
|
||||
// regularization.
|
||||
return ::xgboost::tree::CalcWeight(param, stats);
|
||||
}
|
||||
|
||||
@@ -180,6 +183,15 @@ class TreeEvaluator {
|
||||
.Eval(&lower_bounds_, &upper_bounds_, &monotone_);
|
||||
}
|
||||
};
|
||||
|
||||
enum SplitType {
|
||||
// numerical split
|
||||
kNum = 0,
|
||||
// onehot encoding based categorical split
|
||||
kOneHot = 1,
|
||||
// partition-based categorical split
|
||||
kPart = 2
|
||||
};
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../common/random.h"
|
||||
#include "param.h"
|
||||
@@ -27,6 +28,7 @@ struct GPUTrainingParam {
|
||||
// default=0 means no constraint on weight delta
|
||||
float max_delta_step;
|
||||
float learning_rate;
|
||||
uint32_t max_cat_to_onehot;
|
||||
|
||||
GPUTrainingParam() = default;
|
||||
|
||||
@@ -35,14 +37,10 @@ struct GPUTrainingParam {
|
||||
reg_lambda(param.reg_lambda),
|
||||
reg_alpha(param.reg_alpha),
|
||||
max_delta_step(param.max_delta_step),
|
||||
learning_rate{param.learning_rate} {}
|
||||
learning_rate{param.learning_rate},
|
||||
max_cat_to_onehot{param.max_cat_to_onehot} {}
|
||||
};
|
||||
|
||||
using NodeIdT = int32_t;
|
||||
|
||||
/** used to assign default id to a Node */
|
||||
static const bst_node_t kUnusedNode = -1;
|
||||
|
||||
/**
|
||||
* @enum DefaultDirection node.cuh
|
||||
* @brief Default direction to be followed in case of missing values
|
||||
@@ -59,6 +57,8 @@ struct DeviceSplitCandidate {
|
||||
DefaultDirection dir {kLeftDir};
|
||||
int findex {-1};
|
||||
float fvalue {0};
|
||||
|
||||
common::CatBitField split_cats;
|
||||
bool is_cat { false };
|
||||
|
||||
GradientPairPrecise left_sum;
|
||||
@@ -75,6 +75,28 @@ struct DeviceSplitCandidate {
|
||||
*this = other;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* \brief The largest encoded category in the split bitset
|
||||
*/
|
||||
bst_cat_t MaxCat() const {
|
||||
// Reuse the fvalue for categorical values.
|
||||
return static_cast<bst_cat_t>(fvalue);
|
||||
}
|
||||
/**
|
||||
* \brief Return the best threshold for cat split, reset the value after return.
|
||||
*/
|
||||
XGBOOST_DEVICE size_t PopBestThresh() {
|
||||
// fvalue is also being used for storing the threshold for categorical split
|
||||
auto best_thresh = static_cast<size_t>(this->fvalue);
|
||||
this->fvalue = 0;
|
||||
return best_thresh;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE void SetCat(T c) {
|
||||
this->split_cats.Set(common::AsCat(c));
|
||||
fvalue = std::max(this->fvalue, static_cast<float>(c));
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in,
|
||||
float fvalue_in, int findex_in,
|
||||
@@ -108,18 +130,6 @@ struct DeviceSplitCandidate {
|
||||
}
|
||||
};
|
||||
|
||||
struct DeviceSplitCandidateReduceOp {
|
||||
GPUTrainingParam param;
|
||||
explicit DeviceSplitCandidateReduceOp(GPUTrainingParam param) : param(std::move(param)) {}
|
||||
XGBOOST_DEVICE DeviceSplitCandidate operator()(
|
||||
const DeviceSplitCandidate& a, const DeviceSplitCandidate& b) const {
|
||||
DeviceSplitCandidate best;
|
||||
best.Update(a, param);
|
||||
best.Update(b, param);
|
||||
return best;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct SumCallbackOp {
|
||||
// Running prefix
|
||||
|
||||
@@ -159,6 +159,10 @@ class DeviceHistogram {
|
||||
// Manage memory for a single GPU
|
||||
template <typename GradientSumT>
|
||||
struct GPUHistMakerDevice {
|
||||
private:
|
||||
GPUHistEvaluator<GradientSumT> evaluator_;
|
||||
|
||||
public:
|
||||
int device_id;
|
||||
EllpackPageImpl const* page;
|
||||
common::Span<FeatureType const> feature_types;
|
||||
@@ -182,7 +186,6 @@ struct GPUHistMakerDevice {
|
||||
dh::PinnedMemory pinned;
|
||||
|
||||
common::Monitor monitor;
|
||||
TreeEvaluator tree_evaluator;
|
||||
common::ColumnSampler column_sampler;
|
||||
FeatureInteractionConstraintDevice interaction_constraints;
|
||||
|
||||
@@ -192,24 +195,20 @@ struct GPUHistMakerDevice {
|
||||
// Storing split categories for last node.
|
||||
dh::caching_device_vector<uint32_t> node_categories;
|
||||
|
||||
GPUHistMakerDevice(int _device_id,
|
||||
EllpackPageImpl const* _page,
|
||||
common::Span<FeatureType const> _feature_types,
|
||||
bst_uint _n_rows,
|
||||
TrainParam _param,
|
||||
uint32_t column_sampler_seed,
|
||||
uint32_t n_features,
|
||||
GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page,
|
||||
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
|
||||
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
|
||||
BatchParam _batch_param)
|
||||
: device_id(_device_id),
|
||||
: evaluator_{_param, n_features, _device_id},
|
||||
device_id(_device_id),
|
||||
page(_page),
|
||||
feature_types{_feature_types},
|
||||
param(std::move(_param)),
|
||||
tree_evaluator(param, n_features, _device_id),
|
||||
column_sampler(column_sampler_seed),
|
||||
interaction_constraints(param, n_features),
|
||||
batch_param(std::move(_batch_param)) {
|
||||
sampler.reset(new GradientBasedSampler(
|
||||
page, _n_rows, batch_param, param.subsample, param.sampling_method));
|
||||
sampler.reset(new GradientBasedSampler(page, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method));
|
||||
if (!param.monotone_constraints.empty()) {
|
||||
// Copy assigning an empty vector causes an exception in MSVC debug builds
|
||||
monotone_constraints = param.monotone_constraints;
|
||||
@@ -219,9 +218,8 @@ struct GPUHistMakerDevice {
|
||||
// Init histogram
|
||||
hist.Init(device_id, page->Cuts().TotalBins());
|
||||
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(device_id));
|
||||
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(device_id),
|
||||
sizeof(GradientSumT)));
|
||||
feature_groups.reset(new FeatureGroups(
|
||||
page->Cuts(), page->is_dense, dh::MaxSharedMemoryOptin(device_id), sizeof(GradientSumT)));
|
||||
}
|
||||
|
||||
~GPUHistMakerDevice() { // NOLINT
|
||||
@@ -231,13 +229,17 @@ struct GPUHistMakerDevice {
|
||||
// Reset values for each update iteration
|
||||
// Note that the column sampler must be passed by value because it is not
|
||||
// thread safe
|
||||
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
|
||||
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns,
|
||||
ObjInfo task) {
|
||||
auto const& info = dmat->Info();
|
||||
this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
dh::safe_cuda(cudaSetDevice(device_id));
|
||||
tree_evaluator = TreeEvaluator(param, dmat->Info().num_col_, device_id);
|
||||
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param,
|
||||
device_id);
|
||||
|
||||
this->interaction_constraints.Reset();
|
||||
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPairPrecise{});
|
||||
|
||||
@@ -258,10 +260,8 @@ struct GPUHistMakerDevice {
|
||||
hist.Reset();
|
||||
}
|
||||
|
||||
|
||||
DeviceSplitCandidate EvaluateRootSplit(GradientPairPrecise root_sum) {
|
||||
GPUExpandEntry EvaluateRootSplit(GradientPairPrecise root_sum, float weight, ObjInfo task) {
|
||||
int nidx = RegTree::kRoot;
|
||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out(1);
|
||||
GPUTrainingParam gpu_param(param);
|
||||
auto sampled_features = column_sampler.GetFeatureSet(0);
|
||||
sampled_features->SetDevice(device_id);
|
||||
@@ -277,32 +277,23 @@ struct GPUHistMakerDevice {
|
||||
matrix.gidx_fvalue_map,
|
||||
matrix.min_fvalue,
|
||||
hist.GetNodeHistogram(nidx)};
|
||||
auto gain_calc = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSingleSplit(dh::ToSpan(splits_out), gain_calc, inputs);
|
||||
std::vector<DeviceSplitCandidate> result(1);
|
||||
dh::safe_cuda(cudaMemcpy(result.data(), splits_out.data().get(),
|
||||
sizeof(DeviceSplitCandidate) * splits_out.size(),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return result.front();
|
||||
auto split = this->evaluator_.EvaluateSingleSplit(inputs, weight, task);
|
||||
return split;
|
||||
}
|
||||
|
||||
void EvaluateLeftRightSplits(
|
||||
GPUExpandEntry candidate, int left_nidx, int right_nidx, const RegTree& tree,
|
||||
common::Span<GPUExpandEntry> pinned_candidates_out) {
|
||||
void EvaluateLeftRightSplits(GPUExpandEntry candidate, ObjInfo task, int left_nidx,
|
||||
int right_nidx, const RegTree& tree,
|
||||
common::Span<GPUExpandEntry> pinned_candidates_out) {
|
||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
|
||||
GPUTrainingParam gpu_param(param);
|
||||
auto left_sampled_features =
|
||||
column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
|
||||
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
|
||||
left_sampled_features->SetDevice(device_id);
|
||||
common::Span<bst_feature_t> left_feature_set =
|
||||
interaction_constraints.Query(left_sampled_features->DeviceSpan(),
|
||||
left_nidx);
|
||||
auto right_sampled_features =
|
||||
column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
||||
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
|
||||
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
||||
right_sampled_features->SetDevice(device_id);
|
||||
common::Span<bst_feature_t> right_feature_set =
|
||||
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
|
||||
left_nidx);
|
||||
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
|
||||
auto matrix = page->GetDeviceAccessor(device_id);
|
||||
|
||||
EvaluateSplitInputs<GradientSumT> left{left_nidx,
|
||||
@@ -323,29 +314,11 @@ struct GPUHistMakerDevice {
|
||||
matrix.gidx_fvalue_map,
|
||||
matrix.min_fvalue,
|
||||
hist.GetNodeHistogram(right_nidx)};
|
||||
auto d_splits_out = dh::ToSpan(splits_out);
|
||||
EvaluateSplits(d_splits_out, tree_evaluator.GetEvaluator<GPUTrainingParam>(), left, right);
|
||||
|
||||
dh::TemporaryArray<GPUExpandEntry> entries(2);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
auto d_entries = entries.data().get();
|
||||
dh::LaunchN(2, [=] __device__(size_t idx) {
|
||||
auto split = d_splits_out[idx];
|
||||
auto nidx = idx == 0 ? left_nidx : right_nidx;
|
||||
|
||||
float base_weight = evaluator.CalcWeight(
|
||||
nidx, gpu_param, GradStats{split.left_sum + split.right_sum});
|
||||
float left_weight =
|
||||
evaluator.CalcWeight(nidx, gpu_param, GradStats{split.left_sum});
|
||||
float right_weight = evaluator.CalcWeight(
|
||||
nidx, gpu_param, GradStats{split.right_sum});
|
||||
|
||||
d_entries[idx] =
|
||||
GPUExpandEntry{nidx, candidate.depth + 1, d_splits_out[idx],
|
||||
base_weight, left_weight, right_weight};
|
||||
});
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
pinned_candidates_out.data(), entries.data().get(),
|
||||
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
|
||||
this->evaluator_.EvaluateSplits(candidate, task, left, right, dh::ToSpan(entries));
|
||||
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(),
|
||||
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void BuildHist(int nidx) {
|
||||
@@ -369,12 +342,10 @@ struct GPUHistMakerDevice {
|
||||
});
|
||||
}
|
||||
|
||||
bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram,
|
||||
int nidx_subtraction) {
|
||||
bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) {
|
||||
// Make sure histograms are already allocated
|
||||
hist.AllocateHistogram(nidx_subtraction);
|
||||
return hist.HistogramExists(nidx_histogram) &&
|
||||
hist.HistogramExists(nidx_parent);
|
||||
return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent);
|
||||
}
|
||||
|
||||
void UpdatePosition(int nidx, RegTree* p_tree) {
|
||||
@@ -503,13 +474,12 @@ struct GPUHistMakerDevice {
|
||||
cudaMemcpyHostToDevice));
|
||||
auto d_position = row_partitioner->GetPosition();
|
||||
auto d_node_sum_gradients = device_node_sum_gradients.data().get();
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
auto tree_evaluator = evaluator_.GetEvaluator();
|
||||
|
||||
dh::LaunchN(d_ridx.size(), [=, out_preds_d = out_preds_d] __device__(
|
||||
int local_idx) mutable {
|
||||
dh::LaunchN(d_ridx.size(), [=, out_preds_d = out_preds_d] __device__(int local_idx) mutable {
|
||||
int pos = d_position[local_idx];
|
||||
bst_float weight = evaluator.CalcWeight(
|
||||
pos, param_d, GradStats{d_node_sum_gradients[pos]});
|
||||
bst_float weight =
|
||||
tree_evaluator.CalcWeight(pos, param_d, GradStats{d_node_sum_gradients[pos]});
|
||||
static_assert(!std::is_const<decltype(out_preds_d)>::value, "");
|
||||
out_preds_d(d_ridx[local_idx]) += weight * param_d.learning_rate;
|
||||
});
|
||||
@@ -562,7 +532,6 @@ struct GPUHistMakerDevice {
|
||||
|
||||
void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) {
|
||||
RegTree& tree = *p_tree;
|
||||
auto evaluator = tree_evaluator.GetEvaluator();
|
||||
auto parent_sum = candidate.split.left_sum + candidate.split.right_sum;
|
||||
auto base_weight = candidate.base_weight;
|
||||
auto left_weight = candidate.left_weight * param.learning_rate;
|
||||
@@ -572,48 +541,50 @@ struct GPUHistMakerDevice {
|
||||
if (is_cat) {
|
||||
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
|
||||
<< "Categorical feature value too large.";
|
||||
if (common::InvalidCat(candidate.split.fvalue)) {
|
||||
common::InvalidCategory();
|
||||
std::vector<uint32_t> split_cats;
|
||||
if (candidate.split.split_cats.Bits().empty()) {
|
||||
if (common::InvalidCat(candidate.split.fvalue)) {
|
||||
common::InvalidCategory();
|
||||
}
|
||||
auto cat = common::AsCat(candidate.split.fvalue);
|
||||
split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0);
|
||||
common::CatBitField cats_bits(split_cats);
|
||||
cats_bits.Set(cat);
|
||||
dh::CopyToD(split_cats, &node_categories);
|
||||
} else {
|
||||
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
|
||||
auto max_cat = candidate.split.MaxCat();
|
||||
split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0);
|
||||
CHECK_LE(split_cats.size(), h_cats.size());
|
||||
std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());
|
||||
|
||||
node_categories.resize(candidate.split.split_cats.Bits().size());
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
node_categories.data().get(), candidate.split.split_cats.Data(),
|
||||
candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
auto cat = common::AsCat(candidate.split.fvalue);
|
||||
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat + 1, 1)), 0);
|
||||
LBitField32 cats_bits(split_cats);
|
||||
cats_bits.Set(cat);
|
||||
dh::CopyToD(split_cats, &node_categories);
|
||||
|
||||
tree.ExpandCategorical(
|
||||
candidate.nid, candidate.split.findex, split_cats,
|
||||
candidate.split.dir == kLeftDir, base_weight, left_weight,
|
||||
right_weight, candidate.split.loss_chg, parent_sum.GetHess(),
|
||||
candidate.split.left_sum.GetHess(),
|
||||
candidate.split.right_sum.GetHess());
|
||||
candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
|
||||
base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(),
|
||||
candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
|
||||
} else {
|
||||
tree.ExpandNode(candidate.nid, candidate.split.findex,
|
||||
candidate.split.fvalue, candidate.split.dir == kLeftDir,
|
||||
base_weight, left_weight, right_weight,
|
||||
tree.ExpandNode(candidate.nid, candidate.split.findex, candidate.split.fvalue,
|
||||
candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight,
|
||||
candidate.split.loss_chg, parent_sum.GetHess(),
|
||||
candidate.split.left_sum.GetHess(),
|
||||
candidate.split.right_sum.GetHess());
|
||||
candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
|
||||
}
|
||||
evaluator_.ApplyTreeSplit(candidate, p_tree);
|
||||
|
||||
// Set up child constraints
|
||||
auto left_child = tree[candidate.nid].LeftChild();
|
||||
auto right_child = tree[candidate.nid].RightChild();
|
||||
node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum;
|
||||
node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum;
|
||||
|
||||
tree_evaluator.AddSplit(candidate.nid, left_child, right_child,
|
||||
tree[candidate.nid].SplitIndex(), candidate.left_weight,
|
||||
candidate.right_weight);
|
||||
node_sum_gradients[tree[candidate.nid].LeftChild()] =
|
||||
candidate.split.left_sum;
|
||||
node_sum_gradients[tree[candidate.nid].RightChild()] =
|
||||
candidate.split.right_sum;
|
||||
|
||||
interaction_constraints.Split(
|
||||
candidate.nid, tree[candidate.nid].SplitIndex(),
|
||||
tree[candidate.nid].LeftChild(),
|
||||
interaction_constraints.Split(candidate.nid, tree[candidate.nid].SplitIndex(),
|
||||
tree[candidate.nid].LeftChild(),
|
||||
tree[candidate.nid].RightChild());
|
||||
}
|
||||
|
||||
GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
|
||||
GPUExpandEntry InitRoot(RegTree* p_tree, ObjInfo task, dh::AllReducer* reducer) {
|
||||
constexpr bst_node_t kRootNIdx = 0;
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
|
||||
@@ -634,39 +605,21 @@ struct GPUHistMakerDevice {
|
||||
(*p_tree)[kRootNIdx].SetLeaf(param.learning_rate * weight);
|
||||
|
||||
// Generate first split
|
||||
auto split = this->EvaluateRootSplit(root_sum);
|
||||
dh::TemporaryArray<GPUExpandEntry> entries(1);
|
||||
auto d_entries = entries.data().get();
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
GPUTrainingParam gpu_param(param);
|
||||
auto depth = p_tree->GetDepth(kRootNIdx);
|
||||
dh::LaunchN(1, [=] __device__(size_t idx) {
|
||||
float left_weight = evaluator.CalcWeight(kRootNIdx, gpu_param,
|
||||
GradStats{split.left_sum});
|
||||
float right_weight = evaluator.CalcWeight(
|
||||
kRootNIdx, gpu_param, GradStats{split.right_sum});
|
||||
d_entries[0] =
|
||||
GPUExpandEntry(kRootNIdx, depth, split,
|
||||
weight, left_weight, right_weight);
|
||||
});
|
||||
GPUExpandEntry root_entry;
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
&root_entry, entries.data().get(),
|
||||
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
|
||||
auto root_entry = this->EvaluateRootSplit(root_sum, weight, task);
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo task,
|
||||
RegTree* p_tree, dh::AllReducer* reducer) {
|
||||
auto& tree = *p_tree;
|
||||
Driver<GPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
|
||||
|
||||
monitor.Start("Reset");
|
||||
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
|
||||
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_, task);
|
||||
monitor.Stop("Reset");
|
||||
|
||||
monitor.Start("InitRoot");
|
||||
driver.Push({ this->InitRoot(p_tree, reducer) });
|
||||
driver.Push({ this->InitRoot(p_tree, task, reducer) });
|
||||
monitor.Stop("InitRoot");
|
||||
|
||||
auto num_leaves = 1;
|
||||
@@ -703,8 +656,7 @@ struct GPUHistMakerDevice {
|
||||
monitor.Stop("BuildHist");
|
||||
|
||||
monitor.Start("EvaluateSplits");
|
||||
this->EvaluateLeftRightSplits(candidate, left_child_nidx,
|
||||
right_child_nidx, *p_tree,
|
||||
this->EvaluateLeftRightSplits(candidate, task, left_child_nidx, right_child_nidx, *p_tree,
|
||||
new_candidates.subspan(i * 2, 2));
|
||||
monitor.Stop("EvaluateSplits");
|
||||
} else {
|
||||
@@ -819,14 +771,13 @@ class GPUHistMakerSpecialised {
|
||||
CHECK(*local_tree == reference_tree);
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
RegTree* p_tree) {
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree) {
|
||||
monitor_.Start("InitData");
|
||||
this->InitData(p_fmat);
|
||||
monitor_.Stop("InitData");
|
||||
|
||||
gpair->SetDevice(device_);
|
||||
maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
|
||||
maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_);
|
||||
}
|
||||
|
||||
bool UpdatePredictionCache(const DMatrix *data,
|
||||
|
||||
Reference in New Issue
Block a user