Always use partition based categorical splits. (#7857)

This commit is contained in:
Jiaming Yuan
2022-05-03 22:30:32 +08:00
committed by GitHub
parent 90cce38236
commit 317d7be6ee
13 changed files with 79 additions and 104 deletions

View File

@@ -12,7 +12,6 @@
#include "xgboost/data.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"
#include "xgboost/task.h"
namespace xgboost {
namespace common {
@@ -82,8 +81,8 @@ inline void InvalidCategory() {
/*!
* \brief Whether should we use onehot encoding for categorical data.
*/
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) {
bool use_one_hot = n_cats < max_cat_to_onehot;
return use_one_hot;
}

View File

@@ -199,13 +199,11 @@ __device__ void EvaluateFeature(
}
template <int BLOCK_THREADS, typename GradientSumT>
__global__ void EvaluateSplitsKernel(
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
ObjInfo task,
common::Span<bst_feature_t> sorted_idx,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) {
__global__ void EvaluateSplitsKernel(EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
common::Span<bst_feature_t> sorted_idx,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) {
// KeyValuePair here used as threadIdx.x -> gain_value
using ArgMaxT = cub::KeyValuePair<int, float>;
using BlockScanT =
@@ -241,7 +239,7 @@ __global__ void EvaluateSplitsKernel(
if (common::IsCat(inputs.feature_types, fidx)) {
auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx];
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot, task)) {
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) {
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
kOneHot>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
} else {
@@ -310,7 +308,7 @@ __device__ void SortBasedSplit(EvaluateSplitInputs<GradientSumT> const &input,
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
EvaluateSplitInputs<GradientSumT> left, EvaluateSplitInputs<GradientSumT> right, ObjInfo task,
EvaluateSplitInputs<GradientSumT> left, EvaluateSplitInputs<GradientSumT> right,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits) {
if (!split_cats_.empty()) {
@@ -323,7 +321,7 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
// One block for each feature
uint32_t constexpr kBlockThreads = 256;
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, task, this->SortedIdx(left),
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, this->SortedIdx(left),
evaluator, dh::ToSpan(feature_best_splits));
// Reduce to get best candidate for left and right child over all features
@@ -365,7 +363,7 @@ void GPUHistEvaluator<GradientSumT>::CopyToHost(EvaluateSplitInputs<GradientSumT
}
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, ObjInfo task,
void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
common::Span<GPUExpandEntry> out_entries) {
@@ -373,7 +371,7 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, Ob
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(2);
auto out_splits = dh::ToSpan(splits_out_storage);
this->EvaluateSplits(left, right, task, evaluator, out_splits);
this->EvaluateSplits(left, right, evaluator, out_splits);
auto d_sorted_idx = this->SortedIdx(left);
auto d_entries = out_entries;
@@ -385,7 +383,7 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, Ob
auto fidx = out_splits[i].findex;
if (split.is_cat &&
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) {
bool is_left = i == 0;
auto out = is_left ? cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2);
SortBasedSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]);
@@ -405,11 +403,11 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, Ob
template <typename GradientSumT>
GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
EvaluateSplitInputs<GradientSumT> input, float weight, ObjInfo task) {
EvaluateSplitInputs<GradientSumT> input, float weight) {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(1);
auto out_split = dh::ToSpan(splits_out);
auto evaluator = tree_evaluator_.GetEvaluator<GPUTrainingParam>();
this->EvaluateSplits(input, {}, task, evaluator, out_split);
this->EvaluateSplits(input, {}, evaluator, out_split);
auto cats_out = this->DeviceCatStorage(input.nidx);
auto d_sorted_idx = this->SortedIdx(input);
@@ -421,7 +419,7 @@ GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
auto fidx = out_split[i].findex;
if (split.is_cat &&
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) {
SortBasedSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]);
}

View File

@@ -114,7 +114,7 @@ class GPUHistEvaluator {
/**
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, ObjInfo task,
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, int32_t device);
/**
@@ -150,21 +150,20 @@ class GPUHistEvaluator {
// impl of evaluate splits, contains CUDA kernels so it's public
void EvaluateSplits(EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right, ObjInfo task,
EvaluateSplitInputs<GradientSumT> right,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits);
/**
* \brief Evaluate splits for left and right nodes.
*/
void EvaluateSplits(GPUExpandEntry candidate, ObjInfo task,
void EvaluateSplits(GPUExpandEntry candidate,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
common::Span<GPUExpandEntry> out_splits);
/**
* \brief Evaluate splits for root node.
*/
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs<GradientSumT> input, float weight,
ObjInfo task);
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs<GradientSumT> input, float weight);
};
} // namespace tree
} // namespace xgboost

View File

@@ -16,12 +16,12 @@ namespace xgboost {
namespace tree {
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
common::Span<FeatureType const> ft, ObjInfo task,
common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
if (cuts.HasCategorical() && !task.UseOneHot()) {
if (cuts.HasCategorical()) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
auto beg = thrust::make_counting_iterator<size_t>(1ul);
@@ -34,7 +34,7 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
auto idx = i - 1;
if (common::IsCat(ft, idx)) {
auto n_bins = ptrs[i] - ptrs[idx];
bool use_sort = !common::UseOneHot(n_bins, to_onehot, task);
bool use_sort = !common::UseOneHot(n_bins, to_onehot);
return use_sort;
}
return false;

View File

@@ -11,7 +11,6 @@
#include <utility>
#include <vector>
#include "xgboost/task.h"
#include "../param.h"
#include "../constraints.h"
#include "../split_evaluator.h"
@@ -39,7 +38,6 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
int32_t n_threads_ {0};
FeatureInteractionConstraintHost interaction_constraints_;
std::vector<NodeEntry> snode_;
ObjInfo task_;
// if sum of statistics for non-missing values in the node
// is equal to sum of statistics for all values:
@@ -244,7 +242,7 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
}
if (is_cat) {
auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot, task_)) {
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
} else {
@@ -345,7 +343,6 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
auto const& Stats() const { return snode_; }
auto Task() const { return task_; }
float InitRoot(GradStats const& root_sum) {
snode_.resize(1);
@@ -363,12 +360,11 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
// The column sampler must be constructed by caller since we need to preserve the rng
// for the entire training session.
explicit HistEvaluator(TrainParam const &param, MetaInfo const &info, int32_t n_threads,
std::shared_ptr<common::ColumnSampler> sampler, ObjInfo task)
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param},
column_sampler_{std::move(sampler)},
tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), GenericParameter::kCpuId},
n_threads_{n_threads},
task_{task} {
n_threads_{n_threads} {
interaction_constraints_.Configure(param, info.num_col_);
column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);

View File

@@ -29,10 +29,8 @@ DMLC_REGISTRY_FILE_TAG(updater_approx);
namespace {
// Return the BatchParam used by DMatrix.
template <typename GradientSumT>
auto BatchSpec(TrainParam const &p, common::Span<float> hess,
HistEvaluator<GradientSumT, CPUExpandEntry> const &evaluator) {
return BatchParam{p.max_bin, hess, !evaluator.Task().const_hess};
auto BatchSpec(TrainParam const &p, common::Span<float> hess, ObjInfo const task) {
return BatchParam{p.max_bin, hess, !task.const_hess};
}
auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
@@ -47,7 +45,8 @@ class GloablApproxBuilder {
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator<GradientSumT, CPUExpandEntry> evaluator_;
HistogramBuilder<GradientSumT, CPUExpandEntry> histogram_builder_;
GenericParameter const *ctx_;
Context const *ctx_;
ObjInfo const task_;
std::vector<ApproxRowPartitioner> partitioner_;
// Pointer to last updated tree, used for update prediction cache.
@@ -65,8 +64,7 @@ class GloablApproxBuilder {
int32_t n_total_bins = 0;
partitioner_.clear();
// Generating the GHistIndexMatrix is quite slow, is there a way to speed it up?
for (auto const &page :
p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess, evaluator_))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess, task_))) {
if (n_total_bins == 0) {
n_total_bins = page.cut.TotalBins();
feature_values_ = page.cut;
@@ -158,7 +156,7 @@ class GloablApproxBuilder {
void LeafPartition(RegTree const &tree, common::Span<float> hess,
std::vector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
if (!evaluator_.Task().UpdateTreeLeaf()) {
if (!task_.UpdateTreeLeaf()) {
return;
}
for (auto const &part : partitioner_) {
@@ -173,8 +171,9 @@ class GloablApproxBuilder {
common::Monitor *monitor)
: param_{std::move(param)},
col_sampler_{std::move(column_sampler)},
evaluator_{param_, info, ctx->Threads(), col_sampler_, task},
evaluator_{param_, info, ctx->Threads(), col_sampler_},
ctx_{ctx},
task_{task},
monitor_{monitor} {}
void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,

View File

@@ -232,16 +232,14 @@ struct GPUHistMakerDevice {
// Reset values for each update iteration
// Note that the column sampler must be passed by value because it is not
// thread safe
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns,
ObjInfo task) {
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
auto const& info = dmat->Info();
this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param,
ctx_->gpu_id);
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
this->interaction_constraints.Reset();
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPairPrecise{});
@@ -263,7 +261,7 @@ struct GPUHistMakerDevice {
hist.Reset();
}
GPUExpandEntry EvaluateRootSplit(GradientPairPrecise root_sum, float weight, ObjInfo task) {
GPUExpandEntry EvaluateRootSplit(GradientPairPrecise root_sum, float weight) {
int nidx = RegTree::kRoot;
GPUTrainingParam gpu_param(param);
auto sampled_features = column_sampler.GetFeatureSet(0);
@@ -280,12 +278,12 @@ struct GPUHistMakerDevice {
matrix.gidx_fvalue_map,
matrix.min_fvalue,
hist.GetNodeHistogram(nidx)};
auto split = this->evaluator_.EvaluateSingleSplit(inputs, weight, task);
auto split = this->evaluator_.EvaluateSingleSplit(inputs, weight);
return split;
}
void EvaluateLeftRightSplits(GPUExpandEntry candidate, ObjInfo task, int left_nidx,
int right_nidx, const RegTree& tree,
void EvaluateLeftRightSplits(GPUExpandEntry candidate, int left_nidx, int right_nidx,
const RegTree& tree,
common::Span<GPUExpandEntry> pinned_candidates_out) {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
GPUTrainingParam gpu_param(param);
@@ -319,7 +317,7 @@ struct GPUHistMakerDevice {
hist.GetNodeHistogram(right_nidx)};
dh::TemporaryArray<GPUExpandEntry> entries(2);
this->evaluator_.EvaluateSplits(candidate, task, left, right, dh::ToSpan(entries));
this->evaluator_.EvaluateSplits(candidate, left, right, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(),
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
}
@@ -613,7 +611,7 @@ struct GPUHistMakerDevice {
tree[candidate.nid].RightChild());
}
GPUExpandEntry InitRoot(RegTree* p_tree, ObjInfo task, dh::AllReducer* reducer) {
GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
constexpr bst_node_t kRootNIdx = 0;
dh::XGBCachingDeviceAllocator<char> alloc;
auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
@@ -634,7 +632,7 @@ struct GPUHistMakerDevice {
(*p_tree)[kRootNIdx].SetLeaf(param.learning_rate * weight);
// Generate first split
auto root_entry = this->EvaluateRootSplit(root_sum, weight, task);
auto root_entry = this->EvaluateRootSplit(root_sum, weight);
return root_entry;
}
@@ -645,11 +643,11 @@ struct GPUHistMakerDevice {
Driver<GPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
monitor.Start("Reset");
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_, task);
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
monitor.Stop("Reset");
monitor.Start("InitRoot");
driver.Push({ this->InitRoot(p_tree, task, reducer) });
driver.Push({ this->InitRoot(p_tree, reducer) });
monitor.Stop("InitRoot");
auto num_leaves = 1;
@@ -686,7 +684,7 @@ struct GPUHistMakerDevice {
monitor.Stop("BuildHist");
monitor.Start("EvaluateSplits");
this->EvaluateLeftRightSplits(candidate, task, left_child_nidx, right_child_nidx, *p_tree,
this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree,
new_candidates.subspan(i * 2, 2));
monitor.Stop("EvaluateSplits");
} else {

View File

@@ -178,7 +178,7 @@ void QuantileHistMaker::Builder<GradientSumT>::LeafPartition(
RegTree const &tree, common::Span<GradientPair const> gpair,
std::vector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
if (!evaluator_->Task().UpdateTreeLeaf()) {
if (!task_.UpdateTreeLeaf()) {
return;
}
for (auto const &part : partitioner_) {
@@ -363,7 +363,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(DMatrix *fmat, const Reg
// store a pointer to the tree
p_last_tree_ = &tree;
evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{
param_, info, this->ctx_->Threads(), column_sampler_, task_});
param_, info, this->ctx_->Threads(), column_sampler_});
monitor_->Stop(__func__);
}