Support optimal partitioning for GPU hist. (#7652)
* Implement `MaxCategory` in quantile. * Implement partition-based split for GPU evaluation. Currently, it's based on the existing evaluation function. * Extract an evaluator from GPU Hist to store the needed states. * Added some CUDA stream/event utilities. * Update document with references. * Fixed a bug in approx evaluator where the number of data points is less than the number of categories.
This commit is contained in:
@@ -16,6 +16,10 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
using CatBitField = LBitField32;
|
||||
using KCatBitField = CLBitField32;
|
||||
|
||||
// Cast the categorical type.
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE bst_cat_t AsCat(T const& v) {
|
||||
@@ -57,6 +61,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat
|
||||
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
|
||||
return dft_left;
|
||||
}
|
||||
|
||||
auto pos = KCatBitField::ToBitPos(cat);
|
||||
if (pos.int_pos >= cats.size()) {
|
||||
return true;
|
||||
}
|
||||
return !s_cats.Check(AsCat(cat));
|
||||
}
|
||||
|
||||
@@ -73,18 +82,14 @@ inline void InvalidCategory() {
|
||||
/*!
|
||||
* \brief Whether should we use onehot encoding for categorical data.
|
||||
*/
|
||||
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
|
||||
bool use_one_hot = n_cats < max_cat_to_onehot ||
|
||||
(task.task != ObjInfo::kRegression && task.task != ObjInfo::kBinary);
|
||||
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
|
||||
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
|
||||
return use_one_hot;
|
||||
}
|
||||
|
||||
struct IsCatOp {
|
||||
XGBOOST_DEVICE bool operator()(FeatureType ft) { return ft == FeatureType::kCategorical; }
|
||||
};
|
||||
|
||||
using CatBitField = LBitField32;
|
||||
using KCatBitField = CLBitField32;
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -952,22 +952,22 @@ thrust::device_ptr<T const> tcend(xgboost::HostDeviceVector<T> const& vector) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
|
||||
return thrust::device_ptr<T>(span.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return thrust::device_ptr<T>(span.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
|
||||
return tbegin(span) + span.size();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return tbegin(span) + span.size();
|
||||
}
|
||||
|
||||
@@ -982,12 +982,12 @@ XGBOOST_DEVICE auto trend(xgboost::common::Span<T> &span) { // NOLINT
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return thrust::device_ptr<T const>(span.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
XGBOOST_DEVICE thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
|
||||
return tcbegin(span) + span.size();
|
||||
}
|
||||
|
||||
@@ -1536,4 +1536,69 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
|
||||
safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
|
||||
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
class CUDAStreamView;
|
||||
|
||||
class CUDAEvent {
|
||||
cudaEvent_t event_{nullptr};
|
||||
|
||||
public:
|
||||
CUDAEvent() { dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
|
||||
~CUDAEvent() {
|
||||
if (event_) {
|
||||
dh::safe_cuda(cudaEventDestroy(event_));
|
||||
}
|
||||
}
|
||||
|
||||
CUDAEvent(CUDAEvent const &that) = delete;
|
||||
CUDAEvent &operator=(CUDAEvent const &that) = delete;
|
||||
|
||||
inline void Record(CUDAStreamView stream); // NOLINT
|
||||
|
||||
operator cudaEvent_t() const { return event_; } // NOLINT
|
||||
};
|
||||
|
||||
class CUDAStreamView {
|
||||
cudaStream_t stream_{nullptr};
|
||||
|
||||
public:
|
||||
explicit CUDAStreamView(cudaStream_t s) : stream_{s} {}
|
||||
void Wait(CUDAEvent const &e) {
|
||||
#if defined(__CUDACC_VER_MAJOR__)
|
||||
#if __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0
|
||||
// CUDA == 11.0
|
||||
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, 0));
|
||||
#else
|
||||
// CUDA > 11.0
|
||||
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
|
||||
#endif // __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0:
|
||||
#else // clang
|
||||
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
|
||||
#endif // defined(__CUDACC_VER_MAJOR__)
|
||||
}
|
||||
operator cudaStream_t() const { // NOLINT
|
||||
return stream_;
|
||||
}
|
||||
void Sync() { dh::safe_cuda(cudaStreamSynchronize(stream_)); }
|
||||
};
|
||||
|
||||
inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT
|
||||
dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
|
||||
}
|
||||
|
||||
inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; }
|
||||
|
||||
class CUDAStream {
|
||||
cudaStream_t stream_;
|
||||
|
||||
public:
|
||||
CUDAStream() {
|
||||
dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
|
||||
}
|
||||
~CUDAStream() {
|
||||
dh::safe_cuda(cudaStreamDestroy(stream_));
|
||||
}
|
||||
|
||||
CUDAStreamView View() const { return CUDAStreamView{stream_}; }
|
||||
};
|
||||
} // namespace dh
|
||||
|
||||
@@ -33,66 +33,84 @@ namespace common {
|
||||
*/
|
||||
using GHistIndexRow = Span<uint32_t const>;
|
||||
|
||||
// A CSC matrix representing histogram cuts, used in CPU quantile hist.
|
||||
// A CSC matrix representing histogram cuts.
|
||||
// The cut values represent upper bounds of bins containing approximately equal numbers of elements
|
||||
class HistogramCuts {
|
||||
bool has_categorical_{false};
|
||||
float max_cat_{-1.0f};
|
||||
|
||||
protected:
|
||||
using BinIdx = uint32_t;
|
||||
|
||||
void Swap(HistogramCuts&& that) noexcept(true) {
|
||||
std::swap(cut_values_, that.cut_values_);
|
||||
std::swap(cut_ptrs_, that.cut_ptrs_);
|
||||
std::swap(min_vals_, that.min_vals_);
|
||||
|
||||
std::swap(has_categorical_, that.has_categorical_);
|
||||
std::swap(max_cat_, that.max_cat_);
|
||||
}
|
||||
|
||||
void Copy(HistogramCuts const& that) {
|
||||
cut_values_.Resize(that.cut_values_.Size());
|
||||
cut_ptrs_.Resize(that.cut_ptrs_.Size());
|
||||
min_vals_.Resize(that.min_vals_.Size());
|
||||
cut_values_.Copy(that.cut_values_);
|
||||
cut_ptrs_.Copy(that.cut_ptrs_);
|
||||
min_vals_.Copy(that.min_vals_);
|
||||
has_categorical_ = that.has_categorical_;
|
||||
max_cat_ = that.max_cat_;
|
||||
}
|
||||
|
||||
public:
|
||||
HostDeviceVector<bst_float> cut_values_; // NOLINT
|
||||
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
|
||||
HostDeviceVector<float> cut_values_; // NOLINT
|
||||
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
|
||||
// storing minimum value in a sketch set.
|
||||
HostDeviceVector<float> min_vals_; // NOLINT
|
||||
|
||||
HistogramCuts();
|
||||
HistogramCuts(HistogramCuts const& that) {
|
||||
cut_values_.Resize(that.cut_values_.Size());
|
||||
cut_ptrs_.Resize(that.cut_ptrs_.Size());
|
||||
min_vals_.Resize(that.min_vals_.Size());
|
||||
cut_values_.Copy(that.cut_values_);
|
||||
cut_ptrs_.Copy(that.cut_ptrs_);
|
||||
min_vals_.Copy(that.min_vals_);
|
||||
}
|
||||
HistogramCuts(HistogramCuts const& that) { this->Copy(that); }
|
||||
|
||||
HistogramCuts(HistogramCuts&& that) noexcept(true) {
|
||||
*this = std::forward<HistogramCuts&&>(that);
|
||||
this->Swap(std::forward<HistogramCuts>(that));
|
||||
}
|
||||
|
||||
HistogramCuts& operator=(HistogramCuts const& that) {
|
||||
cut_values_.Resize(that.cut_values_.Size());
|
||||
cut_ptrs_.Resize(that.cut_ptrs_.Size());
|
||||
min_vals_.Resize(that.min_vals_.Size());
|
||||
cut_values_.Copy(that.cut_values_);
|
||||
cut_ptrs_.Copy(that.cut_ptrs_);
|
||||
min_vals_.Copy(that.min_vals_);
|
||||
this->Copy(that);
|
||||
return *this;
|
||||
}
|
||||
|
||||
HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
|
||||
cut_ptrs_ = std::move(that.cut_ptrs_);
|
||||
cut_values_ = std::move(that.cut_values_);
|
||||
min_vals_ = std::move(that.min_vals_);
|
||||
this->Swap(std::forward<HistogramCuts>(that));
|
||||
return *this;
|
||||
}
|
||||
|
||||
uint32_t FeatureBins(uint32_t feature) const {
|
||||
return cut_ptrs_.ConstHostVector().at(feature + 1) -
|
||||
cut_ptrs_.ConstHostVector()[feature];
|
||||
uint32_t FeatureBins(bst_feature_t feature) const {
|
||||
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
|
||||
}
|
||||
|
||||
// Getters. Cuts should be of no use after building histogram indices, but currently
|
||||
// they are deeply linked with quantile_hist, gpu sketcher and gpu_hist, so we preserve
|
||||
// these for now.
|
||||
std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
|
||||
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
|
||||
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
|
||||
|
||||
bool HasCategorical() const { return has_categorical_; }
|
||||
float MaxCategory() const { return max_cat_; }
|
||||
/**
|
||||
* \brief Set meta info about categorical features.
|
||||
*
|
||||
* \param has_cat Do we have categorical feature in the data?
|
||||
* \param max_cat The maximum categorical value in all features.
|
||||
*/
|
||||
void SetCategorical(bool has_cat, float max_cat) {
|
||||
has_categorical_ = has_cat;
|
||||
max_cat_ = max_cat;
|
||||
}
|
||||
|
||||
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
|
||||
|
||||
// Return the index of a cut point that is strictly greater than the input
|
||||
// value, or the last available index if none exists
|
||||
BinIdx SearchBin(float value, uint32_t column_id, std::vector<uint32_t> const& ptrs,
|
||||
BinIdx SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& values) const {
|
||||
auto end = ptrs[column_id + 1];
|
||||
auto beg = ptrs[column_id];
|
||||
@@ -102,7 +120,7 @@ class HistogramCuts {
|
||||
return idx;
|
||||
}
|
||||
|
||||
BinIdx SearchBin(float value, uint32_t column_id) const {
|
||||
BinIdx SearchBin(float value, bst_feature_t column_id) const {
|
||||
return this->SearchBin(value, column_id, Ptrs(), Values());
|
||||
}
|
||||
|
||||
|
||||
@@ -272,7 +272,7 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
|
||||
// move all categories into a flatten vector to prepare for allreduce
|
||||
size_t total = feature_ptr.back();
|
||||
std::vector<bst_cat_t> flatten(total, 0);
|
||||
std::vector<float> flatten(total, 0);
|
||||
auto cursor{flatten.begin()};
|
||||
for (auto const &feat : categories) {
|
||||
cursor = std::copy(feat.cbegin(), feat.cend(), cursor);
|
||||
@@ -287,15 +287,15 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
|
||||
auto gtotal = global_worker_ptr.back();
|
||||
|
||||
// categories in all workers with all features.
|
||||
std::vector<bst_cat_t> global_categories(gtotal, 0);
|
||||
std::vector<float> global_categories(gtotal, 0);
|
||||
auto rank_begin = global_worker_ptr[rank];
|
||||
auto rank_size = global_worker_ptr[rank + 1] - rank_begin;
|
||||
CHECK_EQ(rank_size, total);
|
||||
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
|
||||
// gather values from all workers.
|
||||
rabit::Allreduce<rabit::op::Sum>(global_categories.data(), global_categories.size());
|
||||
QuantileAllreduce<bst_cat_t> allreduce_result{global_categories, global_worker_ptr,
|
||||
global_feat_ptrs, categories.size()};
|
||||
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
|
||||
categories.size()};
|
||||
ParallelFor(categories.size(), n_threads, [&](auto fidx) {
|
||||
if (!IsCat(feature_types, fidx)) {
|
||||
return;
|
||||
@@ -531,6 +531,22 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
|
||||
InvalidCategory();
|
||||
}
|
||||
}
|
||||
auto const &ptrs = cuts->Ptrs();
|
||||
auto const &vals = cuts->Values();
|
||||
|
||||
float max_cat{-std::numeric_limits<float>::infinity()};
|
||||
for (size_t i = 1; i < ptrs.size(); ++i) {
|
||||
if (IsCat(feature_types_, i - 1)) {
|
||||
auto beg = ptrs[i - 1];
|
||||
auto end = ptrs[i];
|
||||
auto feat = Span<float const>{vals}.subspan(beg, end - beg);
|
||||
auto max_elem = *std::max_element(feat.cbegin(), feat.cend());
|
||||
if (max_elem > max_cat) {
|
||||
max_cat = max_elem;
|
||||
}
|
||||
}
|
||||
}
|
||||
cuts->SetCategorical(true, max_cat);
|
||||
}
|
||||
|
||||
monitor_.Stop(__func__);
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
/*!
|
||||
* Copyright 2020 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/binary_search.h>
|
||||
#include <thrust/transform_scan.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/transform_scan.h>
|
||||
#include <thrust/unique.h>
|
||||
|
||||
#include <limits> // std::numeric_limits
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "quantile.h"
|
||||
#include "quantile.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "quantile.cuh"
|
||||
#include "quantile.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -586,7 +587,7 @@ struct InvalidCatOp {
|
||||
Span<uint32_t const> ptrs;
|
||||
Span<FeatureType const> ft;
|
||||
|
||||
XGBOOST_DEVICE bool operator()(size_t i) {
|
||||
XGBOOST_DEVICE bool operator()(size_t i) const {
|
||||
auto fidx = dh::SegmentId(ptrs, i);
|
||||
return IsCat(ft, fidx) && InvalidCat(values[i]);
|
||||
}
|
||||
@@ -683,18 +684,36 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
|
||||
out_column[idx] = in_column[idx+1].value;
|
||||
});
|
||||
|
||||
float max_cat{-1.0f};
|
||||
if (has_categorical_) {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
auto ptrs = p_cuts->cut_ptrs_.ConstDeviceSpan();
|
||||
auto it = thrust::make_counting_iterator(0ul);
|
||||
auto invalid_op = InvalidCatOp{out_cut_values, d_out_columns_ptr, d_ft};
|
||||
auto it = dh::MakeTransformIterator<thrust::pair<bool, float>>(
|
||||
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) {
|
||||
auto fidx = dh::SegmentId(d_out_columns_ptr, i);
|
||||
if (IsCat(d_ft, fidx)) {
|
||||
auto invalid = invalid_op(i);
|
||||
auto v = out_cut_values[i];
|
||||
return thrust::make_pair(invalid, v);
|
||||
}
|
||||
return thrust::make_pair(false, std::numeric_limits<float>::min());
|
||||
});
|
||||
|
||||
CHECK_EQ(p_cuts->Ptrs().back(), out_cut_values.size());
|
||||
auto invalid = thrust::any_of(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
|
||||
InvalidCatOp{out_cut_values, ptrs, d_ft});
|
||||
bool invalid{false};
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
thrust::tie(invalid, max_cat) =
|
||||
thrust::reduce(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
|
||||
thrust::make_pair(false, std::numeric_limits<float>::min()),
|
||||
[=] XGBOOST_DEVICE(thrust::pair<bool, bst_cat_t> const &l,
|
||||
thrust::pair<bool, bst_cat_t> const &r) {
|
||||
return thrust::make_pair(l.first || r.first, std::max(l.second, r.second));
|
||||
});
|
||||
if (invalid) {
|
||||
InvalidCategory();
|
||||
}
|
||||
}
|
||||
|
||||
p_cuts->SetCategorical(this->has_categorical_, max_cat);
|
||||
|
||||
timer_.Stop(__func__);
|
||||
}
|
||||
} // namespace common
|
||||
|
||||
Reference in New Issue
Block a user