Support optimal partitioning for GPU hist. (#7652)

* Implement `MaxCategory` in quantile.
* Implement partition-based split for GPU evaluation.  Currently, it's based on the existing evaluation function.
* Extract an evaluator from GPU Hist to store the needed states.
* Added some CUDA stream/event utilities.
* Update document with references.
* Fixed a bug in the approx evaluator that occurred when the number of data points was less than the number of categories.
This commit is contained in:
Jiaming Yuan
2022-02-15 03:03:12 +08:00
committed by GitHub
parent 2369d55e9a
commit 0d0abe1845
26 changed files with 1088 additions and 528 deletions

View File

@@ -16,6 +16,10 @@
namespace xgboost {
namespace common {
using CatBitField = LBitField32;
using KCatBitField = CLBitField32;
// Cast the categorical type.
template <typename T>
XGBOOST_DEVICE bst_cat_t AsCat(T const& v) {
@@ -57,6 +61,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
return dft_left;
}
auto pos = KCatBitField::ToBitPos(cat);
if (pos.int_pos >= cats.size()) {
return true;
}
return !s_cats.Check(AsCat(cat));
}
@@ -73,18 +82,14 @@ inline void InvalidCategory() {
/*!
* \brief Whether should we use onehot encoding for categorical data.
*/
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot ||
(task.task != ObjInfo::kRegression && task.task != ObjInfo::kBinary);
// Whether one-hot encoding should be used to split a categorical feature.
// One-hot is chosen when the number of categories is below the user-supplied
// threshold, or when the objective itself requires it (task.UseOneHot()).
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
return use_one_hot;
}
// Predicate functor: true when the feature type is categorical.  Marked
// XGBOOST_DEVICE so it can be invoked from device code as well as the host.
struct IsCatOp {
XGBOOST_DEVICE bool operator()(FeatureType ft) { return ft == FeatureType::kCategorical; }
};
using CatBitField = LBitField32;
using KCatBitField = CLBitField32;
} // namespace common
} // namespace xgboost

View File

@@ -952,22 +952,22 @@ thrust::device_ptr<T const> tcend(xgboost::HostDeviceVector<T> const& vector) {
}
template <typename T>
thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T>& span) { // NOLINT
return thrust::device_ptr<T>(span.data());
}
template <typename T>
thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tbegin(xgboost::common::Span<T> const& span) { // NOLINT
return thrust::device_ptr<T>(span.data());
}
template <typename T>
thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T>& span) { // NOLINT
return tbegin(span) + span.size();
}
template <typename T>
thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T> tend(xgboost::common::Span<T> const& span) { // NOLINT
return tbegin(span) + span.size();
}
@@ -982,12 +982,12 @@ XGBOOST_DEVICE auto trend(xgboost::common::Span<T> &span) { // NOLINT
}
template <typename T>
thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T const> tcbegin(xgboost::common::Span<T> const& span) { // NOLINT
return thrust::device_ptr<T const>(span.data());
}
template <typename T>
thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
XGBOOST_DEVICE thrust::device_ptr<T const> tcend(xgboost::common::Span<T> const& span) { // NOLINT
return tcbegin(span) + span.size();
}
@@ -1536,4 +1536,69 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
}
// Forward declaration: CUDAEvent::Record takes a CUDAStreamView by value, so
// its definition is deferred until after CUDAStreamView is complete.
class CUDAStreamView;
// RAII owner of a cudaEvent_t.  The event is created with timing disabled
// (cudaEventDisableTiming): it is used only for stream ordering, not for
// measuring elapsed time.
class CUDAEvent {
cudaEvent_t event_{nullptr};
public:
CUDAEvent() { dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
~CUDAEvent() {
// Skip destruction when the handle is null.
if (event_) {
dh::safe_cuda(cudaEventDestroy(event_));
}
}
// Non-copyable: a copy would trigger a double cudaEventDestroy.
CUDAEvent(CUDAEvent const &that) = delete;
CUDAEvent &operator=(CUDAEvent const &that) = delete;
// Record this event at the current point in `stream` (defined out-of-line
// below, once CUDAStreamView is a complete type).
inline void Record(CUDAStreamView stream); // NOLINT
operator cudaEvent_t() const { return event_; } // NOLINT
};
// Non-owning view of a cudaStream_t.  Cheap to copy; never creates or
// destroys the underlying stream.
class CUDAStreamView {
cudaStream_t stream_{nullptr};
public:
explicit CUDAStreamView(cudaStream_t s) : stream_{s} {}
// Make all future work submitted to this stream wait until `e` completes.
void Wait(CUDAEvent const &e) {
#if defined(__CUDACC_VER_MAJOR__)
#if __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0
// CUDA == 11.0: pass the literal flag 0 (cudaEventWaitDefault is
// presumably not usable with this toolkit version).
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, 0));
#else
// CUDA > 11.0
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
#endif // __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 0
#else // clang
dh::safe_cuda(cudaStreamWaitEvent(stream_, cudaEvent_t{e}, cudaEventWaitDefault));
#endif // defined(__CUDACC_VER_MAJOR__)
}
operator cudaStream_t() const { // NOLINT
return stream_;
}
// Block the calling host thread until all work in this stream has finished.
void Sync() { dh::safe_cuda(cudaStreamSynchronize(stream_)); }
};
// Record this event at the current point in `stream`.  Out-of-line because it
// requires the complete CUDAStreamView type.
inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT
dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
}
// Returns a view of the legacy default stream (cudaStreamLegacy).
inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; }
// RAII owner of a cudaStream_t created with cudaStreamNonBlocking, i.e. work
// in this stream does not implicitly synchronize with the legacy default
// stream.  The stream is destroyed in the destructor.
class CUDAStream {
cudaStream_t stream_;
public:
CUDAStream() {
dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}
~CUDAStream() {
dh::safe_cuda(cudaStreamDestroy(stream_));
}
// Non-copyable: copying would cause cudaStreamDestroy to be invoked twice on
// the same handle (consistent with CUDAEvent, which also deletes its copy
// operations).
CUDAStream(CUDAStream const &that) = delete;
CUDAStream &operator=(CUDAStream const &that) = delete;
// Obtain a non-owning view of the underlying stream.
CUDAStreamView View() const { return CUDAStreamView{stream_}; }
};
} // namespace dh

View File

@@ -33,66 +33,84 @@ namespace common {
*/
using GHistIndexRow = Span<uint32_t const>;
// A CSC matrix representing histogram cuts, used in CPU quantile hist.
// A CSC matrix representing histogram cuts.
// The cut values represent upper bounds of bins containing approximately equal numbers of elements
class HistogramCuts {
bool has_categorical_{false};
float max_cat_{-1.0f};
protected:
using BinIdx = uint32_t;
void Swap(HistogramCuts&& that) noexcept(true) {
std::swap(cut_values_, that.cut_values_);
std::swap(cut_ptrs_, that.cut_ptrs_);
std::swap(min_vals_, that.min_vals_);
std::swap(has_categorical_, that.has_categorical_);
std::swap(max_cat_, that.max_cat_);
}
void Copy(HistogramCuts const& that) {
cut_values_.Resize(that.cut_values_.Size());
cut_ptrs_.Resize(that.cut_ptrs_.Size());
min_vals_.Resize(that.min_vals_.Size());
cut_values_.Copy(that.cut_values_);
cut_ptrs_.Copy(that.cut_ptrs_);
min_vals_.Copy(that.min_vals_);
has_categorical_ = that.has_categorical_;
max_cat_ = that.max_cat_;
}
public:
HostDeviceVector<bst_float> cut_values_; // NOLINT
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
HostDeviceVector<float> cut_values_; // NOLINT
HostDeviceVector<uint32_t> cut_ptrs_; // NOLINT
// storing minimum value in a sketch set.
HostDeviceVector<float> min_vals_; // NOLINT
HistogramCuts();
HistogramCuts(HistogramCuts const& that) {
cut_values_.Resize(that.cut_values_.Size());
cut_ptrs_.Resize(that.cut_ptrs_.Size());
min_vals_.Resize(that.min_vals_.Size());
cut_values_.Copy(that.cut_values_);
cut_ptrs_.Copy(that.cut_ptrs_);
min_vals_.Copy(that.min_vals_);
}
HistogramCuts(HistogramCuts const& that) { this->Copy(that); }
HistogramCuts(HistogramCuts&& that) noexcept(true) {
*this = std::forward<HistogramCuts&&>(that);
this->Swap(std::forward<HistogramCuts>(that));
}
HistogramCuts& operator=(HistogramCuts const& that) {
cut_values_.Resize(that.cut_values_.Size());
cut_ptrs_.Resize(that.cut_ptrs_.Size());
min_vals_.Resize(that.min_vals_.Size());
cut_values_.Copy(that.cut_values_);
cut_ptrs_.Copy(that.cut_ptrs_);
min_vals_.Copy(that.min_vals_);
this->Copy(that);
return *this;
}
HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
cut_ptrs_ = std::move(that.cut_ptrs_);
cut_values_ = std::move(that.cut_values_);
min_vals_ = std::move(that.min_vals_);
this->Swap(std::forward<HistogramCuts>(that));
return *this;
}
uint32_t FeatureBins(uint32_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) -
cut_ptrs_.ConstHostVector()[feature];
uint32_t FeatureBins(bst_feature_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
}
// Getters. Cuts should be of no use after building histogram indices, but currently
// they are deeply linked with quantile_hist, gpu sketcher and gpu_hist, so we preserve
// these for now.
std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
bool HasCategorical() const { return has_categorical_; }
float MaxCategory() const { return max_cat_; }
/**
* \brief Set meta info about categorical features.
*
* \param has_cat Do we have categorical feature in the data?
* \param max_cat The maximum categorical value in all features.
*/
void SetCategorical(bool has_cat, float max_cat) {
has_categorical_ = has_cat;
max_cat_ = max_cat;
}
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
// Return the index of a cut point that is strictly greater than the input
// value, or the last available index if none exists
BinIdx SearchBin(float value, uint32_t column_id, std::vector<uint32_t> const& ptrs,
BinIdx SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
@@ -102,7 +120,7 @@ class HistogramCuts {
return idx;
}
BinIdx SearchBin(float value, uint32_t column_id) const {
BinIdx SearchBin(float value, bst_feature_t column_id) const {
return this->SearchBin(value, column_id, Ptrs(), Values());
}

View File

@@ -272,7 +272,7 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
// move all categories into a flatten vector to prepare for allreduce
size_t total = feature_ptr.back();
std::vector<bst_cat_t> flatten(total, 0);
std::vector<float> flatten(total, 0);
auto cursor{flatten.begin()};
for (auto const &feat : categories) {
cursor = std::copy(feat.cbegin(), feat.cend(), cursor);
@@ -287,15 +287,15 @@ void AllreduceCategories(Span<FeatureType const> feature_types, int32_t n_thread
auto gtotal = global_worker_ptr.back();
// categories in all workers with all features.
std::vector<bst_cat_t> global_categories(gtotal, 0);
std::vector<float> global_categories(gtotal, 0);
auto rank_begin = global_worker_ptr[rank];
auto rank_size = global_worker_ptr[rank + 1] - rank_begin;
CHECK_EQ(rank_size, total);
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
// gather values from all workers.
rabit::Allreduce<rabit::op::Sum>(global_categories.data(), global_categories.size());
QuantileAllreduce<bst_cat_t> allreduce_result{global_categories, global_worker_ptr,
global_feat_ptrs, categories.size()};
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
categories.size()};
ParallelFor(categories.size(), n_threads, [&](auto fidx) {
if (!IsCat(feature_types, fidx)) {
return;
@@ -531,6 +531,22 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
InvalidCategory();
}
}
auto const &ptrs = cuts->Ptrs();
auto const &vals = cuts->Values();
float max_cat{-std::numeric_limits<float>::infinity()};
for (size_t i = 1; i < ptrs.size(); ++i) {
if (IsCat(feature_types_, i - 1)) {
auto beg = ptrs[i - 1];
auto end = ptrs[i];
auto feat = Span<float const>{vals}.subspan(beg, end - beg);
auto max_elem = *std::max_element(feat.cbegin(), feat.cend());
if (max_elem > max_cat) {
max_cat = max_elem;
}
}
}
cuts->SetCategorical(true, max_cat);
}
monitor_.Stop(__func__);

View File

@@ -1,22 +1,23 @@
/*!
* Copyright 2020 by XGBoost Contributors
*/
#include <thrust/unique.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/binary_search.h>
#include <thrust/transform_scan.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/transform_scan.h>
#include <thrust/unique.h>
#include <limits> // std::numeric_limits
#include <memory>
#include <utility>
#include "xgboost/span.h"
#include "quantile.h"
#include "quantile.cuh"
#include "hist_util.h"
#include "device_helpers.cuh"
#include "categorical.h"
#include "common.h"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "quantile.h"
#include "xgboost/span.h"
namespace xgboost {
namespace common {
@@ -586,7 +587,7 @@ struct InvalidCatOp {
Span<uint32_t const> ptrs;
Span<FeatureType const> ft;
XGBOOST_DEVICE bool operator()(size_t i) {
XGBOOST_DEVICE bool operator()(size_t i) const {
auto fidx = dh::SegmentId(ptrs, i);
return IsCat(ft, fidx) && InvalidCat(values[i]);
}
@@ -683,18 +684,36 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
out_column[idx] = in_column[idx+1].value;
});
float max_cat{-1.0f};
if (has_categorical_) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = p_cuts->cut_ptrs_.ConstDeviceSpan();
auto it = thrust::make_counting_iterator(0ul);
auto invalid_op = InvalidCatOp{out_cut_values, d_out_columns_ptr, d_ft};
auto it = dh::MakeTransformIterator<thrust::pair<bool, float>>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(d_out_columns_ptr, i);
if (IsCat(d_ft, fidx)) {
auto invalid = invalid_op(i);
auto v = out_cut_values[i];
return thrust::make_pair(invalid, v);
}
return thrust::make_pair(false, std::numeric_limits<float>::min());
});
CHECK_EQ(p_cuts->Ptrs().back(), out_cut_values.size());
auto invalid = thrust::any_of(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
InvalidCatOp{out_cut_values, ptrs, d_ft});
bool invalid{false};
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::tie(invalid, max_cat) =
thrust::reduce(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
thrust::make_pair(false, std::numeric_limits<float>::min()),
[=] XGBOOST_DEVICE(thrust::pair<bool, bst_cat_t> const &l,
thrust::pair<bool, bst_cat_t> const &r) {
return thrust::make_pair(l.first || r.first, std::max(l.second, r.second));
});
if (invalid) {
InvalidCategory();
}
}
p_cuts->SetCategorical(this->has_categorical_, max_cat);
timer_.Stop(__func__);
}
} // namespace common