Fuse split evaluation kernels (#8026)
This commit is contained in:
parent ff1c559084
commit 794cbaa60a
@@ -2,6 +2,7 @@
  * Copyright 2020-2022 by XGBoost Contributors
  */
 #include <algorithm>  // std::max
+#include <vector>
 #include <limits>
 
 #include "../../common/categorical.h"
@@ -22,11 +23,10 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
                                        TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
                                        bool &missing_left_out) {  // NOLINT
   float parent_gain = CalcGain(param, parent_sum);
-  float missing_left_gain =
-      evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
-                              GradStats(parent_sum - (scan + missing)));
-  float missing_right_gain = evaluator.CalcSplitGain(
-      param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan));
+  float missing_left_gain = evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
+                                                    GradStats(parent_sum - (scan + missing)));
+  float missing_right_gain =
+      evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan));
 
   if (missing_left_gain > missing_right_gain) {
     missing_left_out = true;
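The reflowed call keeps the same decision rule: compute the split gain with the missing-value bucket sent left, then sent right, keep the larger one, and report the chosen direction through missing_left_out. A minimal host-side sketch of that rule, using a hypothetical stand-in for CalcGain (g^2 / (h + lambda)) rather than the library's actual evaluator:

    #include <algorithm>
    #include <iostream>

    struct Grad { double g, h; };  // gradient / hessian pair

    // Hypothetical stand-in for the gain formula: g^2 / (h + lambda).
    double CalcGain(Grad s, double lambda = 1.0) { return s.g * s.g / (s.h + lambda); }

    int main() {
      Grad parent{1.5, 4.0}, scan{0.5, 1.0}, missing{0.4, 1.0};
      // Try sending the missing values left, then right; keep the better gain.
      Grad left_l{scan.g + missing.g, scan.h + missing.h};
      Grad right_l{parent.g - left_l.g, parent.h - left_l.h};
      double missing_left = CalcGain(left_l) + CalcGain(right_l);
      Grad right_r{parent.g - scan.g, parent.h - scan.h};
      double missing_right = CalcGain(scan) + CalcGain(right_r);
      bool missing_left_out = missing_left > missing_right;
      double gain = std::max(missing_left, missing_right) - CalcGain(parent);
      std::cout << "missing goes " << (missing_left_out ? "left" : "right")
                << ", loss change = " << gain << "\n";
    }

The device function presumably differs only in going through the split evaluator (which also applies constraints) instead of the bare formula.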
@@ -47,13 +47,11 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
  * \param end
  * \param temp_storage Shared memory for intermediate result.
  */
-template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT,
-          typename GradientSumT>
-__device__ GradientSumT
-ReduceFeature(common::Span<const GradientSumT> feature_histogram,
-              TempStorageT* temp_storage) {
+template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT, typename GradientSumT>
+__device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_histogram,
+                                      TempStorageT *temp_storage) {
   __shared__ cub::Uninitialized<GradientSumT> uninitialized_sum;
-  GradientSumT& shared_sum = uninitialized_sum.Alias();
+  GradientSumT &shared_sum = uninitialized_sum.Alias();
 
   GradientSumT local_sum = GradientSumT();
   // For loop sums features into one block size
@@ -78,16 +76,15 @@ ReduceFeature(common::Span<const GradientSumT> feature_histogram,
 template <int BLOCK_THREADS, typename ReduceT, typename ScanT, typename MaxReduceT,
           typename TempStorageT, typename GradientSumT, SplitType type>
 __device__ void EvaluateFeature(
-    int fidx, EvaluateSplitInputs<GradientSumT> inputs,
+    int fidx, const EvaluateSplitInputs &inputs, const EvaluateSplitSharedInputs &shared_inputs,
     TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
     common::Span<bst_feature_t> sorted_idx, size_t offset,
     DeviceSplitCandidate *best_split,  // shared memory storing best split
     TempStorageT *temp_storage         // temp memory for cub operations
     ) {
   // Use pointer from cut to indicate begin and end of bins for each feature.
-  uint32_t gidx_begin = inputs.feature_segments[fidx];  // beginning bin
-  uint32_t gidx_end =
-      inputs.feature_segments[fidx + 1];  // end bin for i^th feature
+  uint32_t gidx_begin = shared_inputs.feature_segments[fidx];    // beginning bin
+  uint32_t gidx_end = shared_inputs.feature_segments[fidx + 1];  // end bin for i^th feature
   auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin);
 
   // Sum histogram bins for current feature
@@ -133,8 +130,8 @@ __device__ void EvaluateFeature(
     bool missing_left = true;
     float gain = null_gain;
     if (thread_active) {
-      gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, inputs.param,
-                               inputs.nidx, fidx, evaluator, missing_left);
+      gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum,
+                               shared_inputs.param, inputs.nidx, fidx, evaluator, missing_left);
     }
 
     __syncthreads();
@@ -156,40 +153,40 @@ __device__ void EvaluateFeature(
       switch (type) {
         case kNum: {
           // Use pointer from cut to indicate begin and end of bins for each feature.
-          uint32_t gidx_begin = inputs.feature_segments[fidx];  // beginning bin
+          uint32_t gidx_begin = shared_inputs.feature_segments[fidx];  // beginning bin
           int split_gidx = (scan_begin + threadIdx.x) - 1;
           float fvalue;
           if (split_gidx < static_cast<int>(gidx_begin)) {
-            fvalue = inputs.min_fvalue[fidx];
+            fvalue = shared_inputs.min_fvalue[fidx];
           } else {
-            fvalue = inputs.feature_values[split_gidx];
+            fvalue = shared_inputs.feature_values[split_gidx];
           }
           GradientPairPrecise left =
               missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
           GradientPairPrecise right = inputs.parent_sum - left;
           best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
-                             false, inputs.param);
+                             false, shared_inputs.param);
           break;
         }
         case kOneHot: {
           int32_t split_gidx = (scan_begin + threadIdx.x);
-          float fvalue = inputs.feature_values[split_gidx];
+          float fvalue = shared_inputs.feature_values[split_gidx];
           GradientPairPrecise left =
               missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
           GradientPairPrecise right = inputs.parent_sum - left;
           best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
-                             true, inputs.param);
+                             true, shared_inputs.param);
           break;
         }
         case kPart: {
           int32_t split_gidx = (scan_begin + threadIdx.x);
-          float fvalue = inputs.feature_values[split_gidx];
+          float fvalue = shared_inputs.feature_values[split_gidx];
           GradientPairPrecise left =
               missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
           GradientPairPrecise right = inputs.parent_sum - left;
           auto best_thresh = block_max.key;  // index of best threshold inside a feature.
           best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left,
-                             right, true, inputs.param);
+                             right, true, shared_inputs.param);
           break;
         }
       }
@@ -199,15 +196,14 @@ __device__ void EvaluateFeature(
 }
 
 template <int BLOCK_THREADS, typename GradientSumT>
-__global__ void EvaluateSplitsKernel(EvaluateSplitInputs<GradientSumT> left,
-                                     EvaluateSplitInputs<GradientSumT> right,
-                                     common::Span<bst_feature_t> sorted_idx,
+__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(
+    bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
+    const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
     TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
     common::Span<DeviceSplitCandidate> out_candidates) {
   // KeyValuePair here used as threadIdx.x -> gain_value
   using ArgMaxT = cub::KeyValuePair<int, float>;
-  using BlockScanT =
-      cub::BlockScan<GradientSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>;
+  using BlockScanT = cub::BlockScan<GradientSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>;
   using MaxReduceT = cub::BlockReduce<ArgMaxT, BLOCK_THREADS>;
 
   using SumReduceT = cub::BlockReduce<GradientSumT, BLOCK_THREADS>;
@@ -220,7 +216,7 @@ __global__ void EvaluateSplitsKernel(EvaluateSplitInputs<GradientSumT> left,
 
   // Aligned && shared storage for best_split
   __shared__ cub::Uninitialized<DeviceSplitCandidate> uninitialized_split;
-  DeviceSplitCandidate& best_split = uninitialized_split.Alias();
+  DeviceSplitCandidate &best_split = uninitialized_split.Alias();
   __shared__ TempStorage temp_storage;
 
   if (threadIdx.x == 0) {
@@ -229,30 +225,32 @@ __global__ void EvaluateSplitsKernel(EvaluateSplitInputs<GradientSumT> left,
 
   __syncthreads();
 
-  // If this block is working on the left or right node
-  bool is_left = blockIdx.x < left.feature_set.size();
-  EvaluateSplitInputs<GradientSumT>& inputs = is_left ? left : right;
+  // Allocate blocks to one feature of one node
+  const auto input_idx = blockIdx.x / number_active_features;
+  const EvaluateSplitInputs &inputs = d_inputs[input_idx];
 
   // One block for each feature. Features are sampled, so fidx != blockIdx.x
-  int fidx = inputs.feature_set[is_left ? blockIdx.x
-                                        : blockIdx.x - left.feature_set.size()];
-
-  if (common::IsCat(inputs.feature_types, fidx)) {
-    auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx];
-    if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) {
+  int fidx = inputs.feature_set[blockIdx.x % number_active_features];
+
+  if (common::IsCat(shared_inputs.feature_types, fidx)) {
+    auto n_bins_in_feat =
+        shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx];
+    if (common::UseOneHot(n_bins_in_feat, shared_inputs.param.max_cat_to_onehot)) {
       EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
-                      kOneHot>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
+                      kOneHot>(fidx, inputs, shared_inputs, evaluator, sorted_idx, 0, &best_split,
+                               &temp_storage);
     } else {
-      auto node_sorted_idx = is_left ? sorted_idx.first(inputs.feature_values.size())
-                                     : sorted_idx.last(inputs.feature_values.size());
-      size_t offset = is_left ? 0 : inputs.feature_values.size();
+      auto total_bins = shared_inputs.feature_values.size();
+      size_t offset = total_bins * input_idx;
+      auto node_sorted_idx = sorted_idx.subspan(offset, total_bins);
       EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
-                      kPart>(fidx, inputs, evaluator, node_sorted_idx, offset, &best_split,
-                             &temp_storage);
+                      kPart>(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset,
+                             &best_split, &temp_storage);
     }
   } else {
     EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
-                    kNum>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
+                    kNum>(fidx, inputs, shared_inputs, evaluator, sorted_idx, 0, &best_split,
+                          &temp_storage);
   }
 
   cub::CTA_SYNC();
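This hunk is the heart of the fusion: the old kernel could only tell a left node from a right node by comparing blockIdx.x against the left node's feature count, while the new kernel indexes an arbitrary batch of nodes with a divide and a modulo. A host-side sketch of the mapping, assuming (as the PR does) a uniform number of active features per node:

    #include <cstdio>

    int main() {
      int num_nodes = 3;                // d_inputs.size()
      int number_active_features = 4;   // features per node after sampling
      int grid_size = num_nodes * number_active_features;  // one block per pair
      for (int block = 0; block < grid_size; ++block) {
        int input_idx = block / number_active_features;  // which node
        int feat_pos = block % number_active_features;   // index into feature_set
        std::printf("block %2d -> node %d, feature slot %d\n", block, input_idx, feat_pos);
      }
    }

The grid size is therefore d_inputs.size() * number_active_features, one thread block per (node, feature) pair, regardless of how many nodes are in the batch.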
@@ -262,35 +260,34 @@ __global__ void EvaluateSplitsKernel(EvaluateSplitInputs<GradientSumT> left,
   }
 }
 
-__device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a,
-                                          const DeviceSplitCandidate& b) {
+__device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate &a,
+                                          const DeviceSplitCandidate &b) {
   return b.loss_chg > a.loss_chg ? b : a;
 }
 
 /**
  * \brief Set the bits for categorical splits based on the split threshold.
  */
-template <typename GradientSumT>
-__device__ void SetCategoricalSplit(EvaluateSplitInputs<GradientSumT> const &input,
-                                    common::Span<bst_feature_t const> d_sorted_idx, bst_feature_t fidx,
-                                    bool is_left, common::Span<common::CatBitField::value_type> out,
+__device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inputs,
+                                    common::Span<bst_feature_t const> d_sorted_idx,
+                                    bst_feature_t fidx, std::size_t input_idx,
+                                    common::Span<common::CatBitField::value_type> out,
                                     DeviceSplitCandidate *p_out_split) {
   auto &out_split = *p_out_split;
   out_split.split_cats = common::CatBitField{out};
 
   // Simple case for one hot split
-  if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) {
+  if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
     out_split.split_cats.Set(common::AsCat(out_split.fvalue));
     return;
   }
 
-  auto node_sorted_idx =
-      is_left ? d_sorted_idx.subspan(0, input.feature_values.size())
-              : d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size());
-  size_t node_offset = is_left ? 0 : input.feature_values.size();
+  auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx,
+                                              shared_inputs.feature_values.size());
+  size_t node_offset = input_idx * shared_inputs.feature_values.size();
   auto best_thresh = out_split.PopBestThresh();
-  auto f_sorted_idx =
-      node_sorted_idx.subspan(input.feature_segments[fidx], input.FeatureBins(fidx));
+  auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx],
+                                              shared_inputs.FeatureBins(fidx));
   if (out_split.dir != kLeftDir) {
     // forward, missing on right
     auto beg = dh::tcbegin(f_sorted_idx);
@@ -299,7 +296,7 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs<GradientSumT> const &inp
     boundary = std::max(boundary, static_cast<size_t>(1ul));
     auto end = beg + boundary;
     thrust::for_each(thrust::seq, beg, end, [&](auto c) {
-      auto cat = input.feature_values[c - node_offset];
+      auto cat = shared_inputs.feature_values[c - node_offset];
       assert(!out_split.split_cats.Check(cat) && "already set");
       out_split.SetCat(cat);
     });
@@ -307,44 +304,36 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs<GradientSumT> const &inp
     assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0");
     thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx),
                      dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) {
-                       auto cat = input.feature_values[c - node_offset];
+                       auto cat = shared_inputs.feature_values[c - node_offset];
                        out_split.SetCat(cat);
                      });
   }
 }
 
 template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
-    EvaluateSplitInputs<GradientSumT> left, EvaluateSplitInputs<GradientSumT> right,
+void GPUHistEvaluator<GradientSumT>::LaunchEvaluateSplits(
+    bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
+    EvaluateSplitSharedInputs shared_inputs,
     TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
     common::Span<DeviceSplitCandidate> out_splits) {
   if (need_sort_histogram_) {
-    this->SortHistogram(left, right, evaluator);
+    this->SortHistogram(d_inputs, shared_inputs, evaluator);
   }
 
-  size_t combined_num_features = left.feature_set.size() + right.feature_set.size();
+  size_t combined_num_features = number_active_features * d_inputs.size();
   dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
 
   // One block for each feature
-  uint32_t constexpr kBlockThreads = 256;
+  uint32_t constexpr kBlockThreads = 32;
   dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
-      EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, this->SortedIdx(left),
+      EvaluateSplitsKernel<kBlockThreads, GradientSumT>, number_active_features, d_inputs,
+      shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
       evaluator, dh::ToSpan(feature_best_splits));
 
   // Reduce to get best candidate for left and right child over all features
-  auto reduce_offset = dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
-                                                         [=] __device__(size_t idx) -> size_t {
-                                                           if (idx == 0) {
-                                                             return 0;
-                                                           }
-                                                           if (idx == 1) {
-                                                             return left.feature_set.size();
-                                                           }
-                                                           if (idx == 2) {
-                                                             return combined_num_features;
-                                                           }
-                                                           return 0;
-                                                         });
+  auto reduce_offset = dh::MakeTransformIterator<size_t>(
+      thrust::make_counting_iterator(0llu),
+      [=] __device__(size_t idx) -> size_t { return idx * number_active_features; });
   size_t temp_storage_bytes = 0;
   auto num_segments = out_splits.size();
   cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
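Because every node now owns exactly number_active_features consecutive slots in feature_best_splits, the segment offsets collapse from a three-case lambda to a plain multiplication, and cub::DeviceSegmentedReduce::Sum together with the candidate operator+ above (which keeps the candidate with the larger loss_chg) yields the best split per node. A CPU sketch of what that segmented "sum" computes:

    #include <cstdio>
    #include <vector>

    struct Candidate { float loss_chg; int findex; };
    // Mirrors the diff's operator+: "addition" keeps the better candidate.
    Candidate Best(Candidate a, Candidate b) { return b.loss_chg > a.loss_chg ? b : a; }

    int main() {
      int num_nodes = 2, number_active_features = 3;
      std::vector<Candidate> per_feature = {{0.1f, 0}, {0.9f, 1}, {0.4f, 2},   // node 0
                                            {0.7f, 0}, {0.2f, 1}, {0.3f, 2}};  // node 1
      for (int n = 0; n < num_nodes; ++n) {
        // Segment offset is idx * number_active_features, as in the transform iterator.
        std::size_t begin = static_cast<std::size_t>(n) * number_active_features;
        Candidate best = per_feature[begin];
        for (std::size_t j = begin + 1; j < begin + number_active_features; ++j)
          best = Best(best, per_feature[j]);
        std::printf("node %d: best feature %d (loss_chg %.2f)\n", n, best.findex, best.loss_chg);
      }
    }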
@@ -357,89 +346,73 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
 }
 
 template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::CopyToHost(EvaluateSplitInputs<GradientSumT> const &input,
-                                                common::Span<CatST> cats_out) {
-  if (cats_out.empty()) return;
+void GPUHistEvaluator<GradientSumT>::CopyToHost(const std::vector<bst_node_t> &nidx) {
+  if (!has_categoricals_) return;
+  auto d_cats = this->DeviceCatStorage(nidx);
+  auto h_cats = this->HostCatStorage(nidx);
   dh::CUDAEvent event;
   event.Record(dh::DefaultStream());
-  auto h_cats = this->HostCatStorage(input.nidx);
-  copy_stream_.View().Wait(event);
-  dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(),
-                                cudaMemcpyDeviceToHost, copy_stream_.View()));
+  for (auto idx : nidx) {
+    copy_stream_.View().Wait(event);
+    dh::safe_cuda(cudaMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+  }
 }
 
 template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate,
-                                                    EvaluateSplitInputs<GradientSumT> left,
-                                                    EvaluateSplitInputs<GradientSumT> right,
-                                                    common::Span<GPUExpandEntry> out_entries) {
+void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
+    const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
+    common::Span<GPUExpandEntry> out_entries) {
   auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
 
-  dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(2);
+  dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
   auto out_splits = dh::ToSpan(splits_out_storage);
-  this->EvaluateSplits(left, right, evaluator, out_splits);
+  this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator,
+                             out_splits);
 
-  auto d_sorted_idx = this->SortedIdx(left);
+  auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
   auto d_entries = out_entries;
-  auto cats_out = this->DeviceCatStorage(left.nidx);
+  auto device_cats_accessor = this->DeviceCatStorage(nidx);
   // turn candidate into entry, along with handling sort based split.
-  dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) {
-    auto const &input = i == 0 ? left : right;
+  dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
+    auto const input = d_inputs[i];
     auto &split = out_splits[i];
     auto fidx = out_splits[i].findex;
 
     if (split.is_cat) {
-      bool is_left = i == 0;
-      auto out = is_left ? cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2);
-      SetCategoricalSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]);
+      SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
+                          device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
    }
 
-    float base_weight =
-        evaluator.CalcWeight(input.nidx, input.param, GradStats{split.left_sum + split.right_sum});
-    float left_weight = evaluator.CalcWeight(input.nidx, input.param, GradStats{split.left_sum});
-    float right_weight = evaluator.CalcWeight(input.nidx, input.param, GradStats{split.right_sum});
+    float base_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param,
+                                             GradStats{split.left_sum + split.right_sum});
+    float left_weight =
+        evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum});
+    float right_weight =
+        evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.right_sum});
 
-    d_entries[i] = GPUExpandEntry{input.nidx, candidate.depth + 1, out_splits[i],
+    d_entries[i] = GPUExpandEntry{input.nidx, input.depth, out_splits[i],
                                   base_weight, left_weight, right_weight};
   });
 
-  this->CopyToHost(left, cats_out);
+  this->CopyToHost(nidx);
 }
 
 template <typename GradientSumT>
 GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
-    EvaluateSplitInputs<GradientSumT> input, float weight) {
-  dh::TemporaryArray<DeviceSplitCandidate> splits_out(1);
-  auto out_split = dh::ToSpan(splits_out);
-  auto evaluator = tree_evaluator_.GetEvaluator<GPUTrainingParam>();
-  this->EvaluateSplits(input, {}, evaluator, out_split);
-
-  auto cats_out = this->DeviceCatStorage(input.nidx);
-  auto d_sorted_idx = this->SortedIdx(input);
-
-  dh::TemporaryArray<GPUExpandEntry> entries(1);
-  auto d_entries = entries.data().get();
-  dh::LaunchN(1, [=] __device__(size_t i) {
-    auto &split = out_split[i];
-    auto fidx = out_split[i].findex;
-
-    if (split.is_cat) {
-      SetCategoricalSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]);
-    }
-
-    float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum});
-    float right_weight = evaluator.CalcWeight(0, input.param, GradStats{split.right_sum});
-    d_entries[0] = GPUExpandEntry(0, 0, split, weight, left_weight, right_weight);
-  });
-  this->CopyToHost(input, cats_out);
+    EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs, float weight) {
+  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::TemporaryArray<GPUExpandEntry> out_entries(1);
+  this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
+                       dh::ToSpan(out_entries));
 
   GPUExpandEntry root_entry;
-  dh::safe_cuda(cudaMemcpyAsync(&root_entry, entries.data().get(),
-                                sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
+  dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                                cudaMemcpyDeviceToHost));
   return root_entry;
 }
 
-template class GPUHistEvaluator<GradientPair>;
 template class GPUHistEvaluator<GradientPairPrecise>;
 }  // namespace tree
 }  // namespace xgboost
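With the batched entry point in place, the single-split (root) path becomes a thin wrapper: put the one node input into a one-element batch and reuse the same code. The pattern, sketched with hypothetical, heavily simplified types:

    #include <cstdio>
    #include <vector>

    struct NodeInput { int nidx; };
    struct Entry { int nidx; float loss_chg; };

    // Batched API: one entry out per node input.
    std::vector<Entry> EvaluateSplitsBatch(const std::vector<NodeInput> &inputs) {
      std::vector<Entry> out;
      for (auto &in : inputs) out.push_back({in.nidx, 0.5f});  // placeholder evaluation
      return out;
    }

    // Root evaluation reuses the batched path with a single-element batch.
    Entry EvaluateSingleSplit(NodeInput input) {
      return EvaluateSplitsBatch({input}).front();
    }

    int main() {
      Entry root = EvaluateSingleSplit({0});
      std::printf("root nidx=%d loss_chg=%.2f\n", root.nidx, root.loss_chg);
    }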
@@ -17,24 +17,40 @@ class HistogramCuts;
 }
 
 namespace tree {
-template <typename GradientSumT>
+// Inputs specific to each node
 struct EvaluateSplitInputs {
   int nidx;
+  int depth;
   GradientPairPrecise parent_sum;
-  GPUTrainingParam param;
   common::Span<const bst_feature_t> feature_set;
+  common::Span<const GradientPairPrecise> gradient_histogram;
+};
+
+// Inputs necessary for all nodes
+struct EvaluateSplitSharedInputs {
+  GPUTrainingParam param;
   common::Span<FeatureType const> feature_types;
   common::Span<const uint32_t> feature_segments;
   common::Span<const float> feature_values;
   common::Span<const float> min_fvalue;
-  common::Span<const GradientSumT> gradient_histogram;
 
   XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
   __device__ auto FeatureBins(bst_feature_t fidx) const {
     return feature_segments[fidx + 1] - feature_segments[fidx];
   }
 };
 
+// Used to return internal storage regions for categoricals
+// Usable on device
+struct CatAccessor {
+  common::Span<common::CatBitField::value_type> cat_storage;
+  std::size_t node_categorical_storage_size;
+  XGBOOST_DEVICE common::Span<common::CatBitField::value_type> GetNodeCatStorage(bst_node_t nidx) {
+    return this->cat_storage.subspan(nidx * this->node_categorical_storage_size,
+                                     this->node_categorical_storage_size);
+  }
+};
+
 template <typename GradientSumT>
 class GPUHistEvaluator {
   using CatST = common::CatBitField::value_type;  // categorical storage type
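CatAccessor replaces the old subspan juggling: one flat buffer holds the category bit fields for all nodes, and each node reads or writes the fixed-size slice starting at nidx * node_categorical_storage_size. A simplified, host-only sketch of the layout (the real type wraps common::Span over CatBitField words):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Simplified stand-in for CatAccessor: one flat buffer, fixed slice per node.
    struct Accessor {
      std::uint32_t *storage;
      std::size_t node_size;  // words of bitfield storage per node
      std::uint32_t *NodeBegin(int nidx) { return storage + nidx * node_size; }
    };

    int main() {
      std::size_t node_size = 2;  // enough bits for all categories of one node
      std::vector<std::uint32_t> buf(8 * node_size, 0);  // nodes 0..7
      Accessor acc{buf.data(), node_size};
      acc.NodeBegin(3)[0] |= 1u << 5;  // set category 5 for node 3
      std::printf("node 3 word 0 = %#x\n", acc.NodeBegin(3)[0]);
    }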
@@ -61,61 +77,53 @@ class GPUHistEvaluator {
   // Do we have any categorical features that require sorting histograms?
   // use this to skip the expensive sort step
   bool need_sort_histogram_ = false;
+  bool has_categoricals_ = false;
   // Number of elements of categorical storage type
   // needed to hold categoricals for a single mode
   std::size_t node_categorical_storage_size_ = 0;
 
   // Copy the categories from device to host asynchronously.
-  void CopyToHost(EvaluateSplitInputs<GradientSumT> const &input, common::Span<CatST> cats_out);
+  void CopyToHost( const std::vector<bst_node_t>& nidx);
 
   /**
   * \brief Get host category storage of nidx for internal calculation.
   */
-  auto HostCatStorage(bst_node_t nidx) {
-    std::size_t min_size=(nidx+2)*node_categorical_storage_size_;
-    if(h_split_cats_.size()<min_size){
+  auto HostCatStorage(const std::vector<bst_node_t> &nidx) {
+    if (!has_categoricals_) return CatAccessor{};
+    auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
+    std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
+    if (h_split_cats_.size() < min_size) {
       h_split_cats_.resize(min_size);
     }
-
-    if (nidx == RegTree::kRoot) {
-      auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
-      return cats_out;
-    }
-    auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
-    return cats_out;
+    return CatAccessor{{h_split_cats_.data(), h_split_cats_.size()},
+                       node_categorical_storage_size_};
   }
 
   /**
   * \brief Get device category storage of nidx for internal calculation.
   */
-  auto DeviceCatStorage(bst_node_t nidx) {
-    std::size_t min_size=(nidx+2)*node_categorical_storage_size_;
-    if(split_cats_.size()<min_size){
+  auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
+    if (!has_categoricals_) return CatAccessor{};
+    auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
+    std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
+    if (split_cats_.size() < min_size) {
       split_cats_.resize(min_size);
     }
-    if (nidx == RegTree::kRoot) {
-      auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
-      return cats_out;
-    }
-    auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
-    return cats_out;
+    return CatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_};
   }
 
   /**
   * \brief Get sorted index storage based on the left node of inputs.
   */
-  auto SortedIdx(EvaluateSplitInputs<GradientSumT> left) {
-    if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {
-      return dh::ToSpan(cat_sorted_idx_).first(left.feature_values.size());
-    }
+  auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
+    if(!need_sort_histogram_) return common::Span<bst_feature_t>();
+    cat_sorted_idx_.resize(num_nodes * total_bins);
     return dh::ToSpan(cat_sorted_idx_);
   }
 
-  auto SortInput(EvaluateSplitInputs<GradientSumT> left) {
-    if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {
-      return dh::ToSpan(sort_input_).first(left.feature_values.size());
-    }
+  auto SortInput(int num_nodes, bst_feature_t total_bins) {
+    if(!need_sort_histogram_) return common::Span<SortPair>();
+    sort_input_.resize(num_nodes * total_bins);
     return dh::ToSpan(sort_input_);
   }
 
@@ -154,26 +162,24 @@ class GPUHistEvaluator {
   /**
   * \brief Sort the histogram based on output to obtain contiguous partitions.
   */
-  common::Span<bst_feature_t const> SortHistogram(
-      EvaluateSplitInputs<GradientSumT> const &left, EvaluateSplitInputs<GradientSumT> const &right,
+  common::Span<bst_feature_t const> SortHistogram(common::Span<const EvaluateSplitInputs> d_inputs,
+                                                  EvaluateSplitSharedInputs shared_inputs,
       TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
 
   // impl of evaluate splits, contains CUDA kernels so it's public
-  void EvaluateSplits(EvaluateSplitInputs<GradientSumT> left,
-                      EvaluateSplitInputs<GradientSumT> right,
+  void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs,
                       TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
                       common::Span<DeviceSplitCandidate> out_splits);
   /**
   * \brief Evaluate splits for left and right nodes.
   */
-  void EvaluateSplits(GPUExpandEntry candidate,
-                      EvaluateSplitInputs<GradientSumT> left,
-                      EvaluateSplitInputs<GradientSumT> right,
+  void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,
+                      EvaluateSplitSharedInputs shared_inputs,
                       common::Span<GPUExpandEntry> out_splits);
   /**
   * \brief Evaluate splits for root node.
   */
-  GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs<GradientSumT> input, float weight);
+  GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,EvaluateSplitSharedInputs shared_inputs, float weight);
 };
 }  // namespace tree
 }  // namespace xgboost
@@ -21,6 +21,7 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
                                            int32_t device) {
   param_ = param;
   tree_evaluator_ = TreeEvaluator{param, n_features, device};
+  has_categoricals_ = cuts.HasCategorical();
   if (cuts.HasCategorical()) {
     dh::XGBCachingDeviceAllocator<char> alloc;
     auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
@@ -69,42 +70,46 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
 
 template <typename GradientSumT>
 common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
-    EvaluateSplitInputs<GradientSumT> const &left, EvaluateSplitInputs<GradientSumT> const &right,
+    common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
     TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
   dh::XGBCachingDeviceAllocator<char> alloc;
-  auto sorted_idx = this->SortedIdx(left);
+  auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
   dh::Iota(sorted_idx);
-  auto data = this->SortInput(left);
+  auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size());
   auto it = thrust::make_counting_iterator(0u);
   auto d_feature_idx = dh::ToSpan(feature_idx_);
+  auto total_bins = shared_inputs.feature_values.size();
   thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data),
                     [=] XGBOOST_DEVICE(uint32_t i) {
-                      auto is_left = i < left.feature_values.size();
-                      auto const &input = is_left ? left : right;
-                      auto j = i - (is_left ? 0 : input.feature_values.size());
+                      auto const &input = d_inputs[i / total_bins];
+                      auto j = i % total_bins;
                       auto fidx = d_feature_idx[j];
-                      if (common::IsCat(input.feature_types, fidx)) {
-                        auto lw = evaluator.CalcWeightCat(input.param, input.gradient_histogram[j]);
+                      if (common::IsCat(shared_inputs.feature_types, fidx)) {
+                        auto lw = evaluator.CalcWeightCat(shared_inputs.param,
+                                                          input.gradient_histogram[j]);
                         return thrust::make_tuple(i, lw);
                       }
                       return thrust::make_tuple(i, 0.0);
                     });
+  // Sort an array segmented according to
+  // - nodes
+  // - features within each node
+  // - gradients within each feature
   thrust::stable_sort_by_key(thrust::cuda::par(alloc), dh::tbegin(data), dh::tend(data),
                              dh::tbegin(sorted_idx),
                              [=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
                                auto li = thrust::get<0>(l);
                                auto ri = thrust::get<0>(r);
 
-                               auto l_is_left = li < left.feature_values.size();
-                               auto r_is_left = ri < left.feature_values.size();
+                               auto l_node = li / total_bins;
+                               auto r_node = ri / total_bins;
 
-                               if (l_is_left != r_is_left) {
-                                 return l_is_left;  // not the same node
+                               if (l_node != r_node) {
+                                 return l_node < r_node;  // not the same node
                                }
 
-                               auto const &input = l_is_left ? left : right;
-                               li -= (l_is_left ? 0 : input.feature_values.size());
-                               ri -= (r_is_left ? 0 : input.feature_values.size());
+                               li = li % total_bins;
+                               ri = ri % total_bins;
 
                                auto lfidx = d_feature_idx[li];
                                auto rfidx = d_feature_idx[ri];
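The sort key keeps its three-level ordering; only the node test changes from a left/right flag to an integer node index derived from the global bin position. A host-side sketch of the comparator over a two-node layout:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      int total_bins = 4;  // bins per node; global index i encodes (node, bin)
      std::vector<int> feature_of_bin = {0, 0, 1, 1};  // d_feature_idx analogue
      std::vector<float> weight = {0.9f, 0.1f, 0.0f, 0.0f,   // node 0
                                   0.3f, 0.5f, 0.0f, 0.0f};  // node 1
      std::vector<int> sorted_idx(weight.size());
      for (std::size_t i = 0; i < sorted_idx.size(); ++i) sorted_idx[i] = static_cast<int>(i);
      std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](int l, int r) {
        int ln = l / total_bins, rn = r / total_bins;
        if (ln != rn) return ln < rn;                     // group by node
        int lf = feature_of_bin[l % total_bins], rf = feature_of_bin[r % total_bins];
        if (lf != rf) return lf < rf;                     // then by feature
        return weight[l] < weight[r];                     // then by weight (categoricals)
      });
      for (int i : sorted_idx) std::printf("%d ", i);
      std::printf("\n");
    }

This is what makes each node's slice of the sorted index self-contained, so the kernel can hand feature subranges of sorted_idx.subspan(node * total_bins, total_bins) to the partition-based evaluation.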
@@ -113,7 +118,7 @@ common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
                                  return lfidx < rfidx;  // not the same feature
                                }
 
-                               if (common::IsCat(input.feature_types, lfidx)) {
+                               if (common::IsCat(shared_inputs.feature_types, lfidx)) {
                                  auto lw = thrust::get<1>(l);
                                  auto rw = thrust::get<1>(r);
                                  return lw < rw;
@@ -196,6 +196,7 @@ struct GPUHistMakerDevice {
   HistRounding<GradientSumT> histogram_rounding;
 
   dh::PinnedMemory pinned;
+  dh::PinnedMemory pinned2;
 
   common::Monitor monitor;
   common::ColumnSampler column_sampler;
@@ -279,58 +280,64 @@ struct GPUHistMakerDevice {
     common::Span<bst_feature_t> feature_set =
         interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
     auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
-    EvaluateSplitInputs<GradientSumT> inputs{nidx,
-                                             root_sum,
-                                             gpu_param,
-                                             feature_set,
-                                             feature_types,
-                                             matrix.feature_segments,
-                                             matrix.gidx_fvalue_map,
-                                             matrix.min_fvalue,
-                                             hist.GetNodeHistogram(nidx)};
-    auto split = this->evaluator_.EvaluateSingleSplit(inputs, weight);
+    EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
+    EvaluateSplitSharedInputs shared_inputs{
+        gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map,
+        matrix.min_fvalue,
+    };
+    auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs, weight);
     return split;
   }
 
-  void EvaluateLeftRightSplits(GPUExpandEntry candidate, int left_nidx, int right_nidx,
-                               const RegTree& tree,
+  void EvaluateSplits(const std::vector<GPUExpandEntry>& candidates, const RegTree& tree,
                                common::Span<GPUExpandEntry> pinned_candidates_out) {
-    dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
-    GPUTrainingParam gpu_param(param);
-    auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
-    left_sampled_features->SetDevice(ctx_->gpu_id);
-    common::Span<bst_feature_t> left_feature_set =
-        interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
-    auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
-    right_sampled_features->SetDevice(ctx_->gpu_id);
-    common::Span<bst_feature_t> right_feature_set =
-        interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
+    if (candidates.empty()) return;
+    dh::TemporaryArray<EvaluateSplitInputs> d_node_inputs(2 * candidates.size());
+    dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
+    std::vector<bst_node_t> nidx(2 * candidates.size());
+    auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
     auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    EvaluateSplitSharedInputs shared_inputs{
+        GPUTrainingParam(param), feature_types, matrix.feature_segments,
+        matrix.gidx_fvalue_map, matrix.min_fvalue,
+    };
+    dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
+    for (int i = 0; i < candidates.size(); i++) {
+      auto candidate = candidates.at(i);
+      int left_nidx = tree[candidate.nid].LeftChild();
+      int right_nidx = tree[candidate.nid].RightChild();
+      nidx[i * 2] = left_nidx;
+      nidx[i * 2 + 1] = right_nidx;
+      auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
+      left_sampled_features->SetDevice(ctx_->gpu_id);
+      common::Span<bst_feature_t> left_feature_set =
+          interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
+      auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
+      right_sampled_features->SetDevice(ctx_->gpu_id);
+      common::Span<bst_feature_t> right_feature_set =
+          interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
+      h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
+                              left_feature_set, hist.GetNodeHistogram(left_nidx)};
+      h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
+                                  right_feature_set, hist.GetNodeHistogram(right_nidx)};
+    }
+    bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
+    for (auto input : h_node_inputs) {
+      CHECK_EQ(input.feature_set.size(), number_active_features)
+          << "Current implementation assumes that the number of active features "
+             "(after sampling) in any node is the same";
+    }
+    dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
+                                  h_node_inputs.size() * sizeof(EvaluateSplitInputs),
+                                  cudaMemcpyDefault));
 
-    EvaluateSplitInputs<GradientSumT> left{left_nidx,
-                                           candidate.split.left_sum,
-                                           gpu_param,
-                                           left_feature_set,
-                                           feature_types,
-                                           matrix.feature_segments,
-                                           matrix.gidx_fvalue_map,
-                                           matrix.min_fvalue,
-                                           hist.GetNodeHistogram(left_nidx)};
-    EvaluateSplitInputs<GradientSumT> right{right_nidx,
-                                            candidate.split.right_sum,
-                                            gpu_param,
-                                            right_feature_set,
-                                            feature_types,
-                                            matrix.feature_segments,
-                                            matrix.gidx_fvalue_map,
-                                            matrix.min_fvalue,
-                                            hist.GetNodeHistogram(right_nidx)};
-
-    dh::TemporaryArray<GPUExpandEntry> entries(2);
-    this->evaluator_.EvaluateSplits(candidate, left, right, dh::ToSpan(entries));
-    dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(),
-                                  sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
+    this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
+                                    shared_inputs, dh::ToSpan(entries));
+    dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
+                                  entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
+                                  cudaMemcpyDeviceToHost));
+    dh::DefaultStream().Sync();
   }
 
   void BuildHist(int nidx) {
     auto d_node_hist = hist.GetNodeHistogram(nidx);
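The updater-side change mirrors the evaluator: instead of one call per expand candidate, all left and right children are packed into a pinned host array (left child at slot 2i, right at 2i+1), copied to the device once, and evaluated in a single batched call. A sketch of just the packing step, with trimmed-down stand-in types:

    #include <cstdio>
    #include <vector>

    struct NodeInput { int nidx; int depth; };  // trimmed-down EvaluateSplitInputs

    int main() {
      struct Cand { int left, right, depth; };
      std::vector<Cand> candidates = {{1, 2, 0}, {3, 4, 1}};
      std::vector<NodeInput> h_node_inputs(2 * candidates.size());
      for (std::size_t i = 0; i < candidates.size(); ++i) {
        // Left child at slot 2i, right child at 2i+1, as in the diff.
        h_node_inputs[i * 2] = {candidates[i].left, candidates[i].depth + 1};
        h_node_inputs[i * 2 + 1] = {candidates[i].right, candidates[i].depth + 1};
      }
      for (auto &in : h_node_inputs) std::printf("nidx=%d depth=%d\n", in.nidx, in.depth);
    }

The CHECK_EQ loop above enforces the PR's stated assumption that column sampling yields the same number of active features in every node of the batch.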
@@ -697,16 +704,9 @@ struct GPUHistMakerDevice {
       this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
       monitor.Stop("BuildHist");
 
-      for (auto i = 0ull; i < filtered_expand_set.size(); i++) {
-        auto candidate = filtered_expand_set.at(i);
-        int left_child_nidx = tree[candidate.nid].LeftChild();
-        int right_child_nidx = tree[candidate.nid].RightChild();
-
-        monitor.Start("EvaluateSplits");
-        this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree,
-                                      new_candidates.subspan(i * 2, 2));
-        monitor.Stop("EvaluateSplits");
-      }
+      monitor.Start("EvaluateSplits");
+      this->EvaluateSplits(filtered_expand_set, *p_tree, new_candidates);
+      monitor.Stop("EvaluateSplits");
       dh::DefaultStream().Sync();
       driver.Push(new_candidates.begin(), new_candidates.end());
       expand_set = driver.Pop();
@@ -35,8 +35,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
       std::vector<bst_feature_t>{0, 1};
 
   // Setup gradients so that second feature gets higher gain
-  thrust::device_vector<GradientPair> feature_histogram =
-      std::vector<GradientPair>{
+  thrust::device_vector<GradientPairPrecise> feature_histogram =
+      std::vector<GradientPairPrecise>{
           {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}};
 
   thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
@@ -50,21 +50,23 @@ void TestEvaluateSingleSplit(bool is_categorical) {
     d_feature_types = dh::ToSpan(feature_types);
   }
 
-  EvaluateSplitInputs<GradientPair> input{1,
+  EvaluateSplitInputs input{1,0,
                                           parent_sum,
-                                          param,
                                           dh::ToSpan(feature_set),
-                                          d_feature_types,
-                                          cuts.cut_ptrs_.ConstDeviceSpan(),
-                                          cuts.cut_values_.ConstDeviceSpan(),
-                                          cuts.min_vals_.ConstDeviceSpan(),
                                           dh::ToSpan(feature_histogram)};
+  EvaluateSplitSharedInputs shared_inputs{
+      param,
+      d_feature_types,
+      cuts.cut_ptrs_.ConstDeviceSpan(),
+      cuts.cut_values_.ConstDeviceSpan(),
+      cuts.min_vals_.ConstDeviceSpan(),
+  };
 
-  GPUHistEvaluator<GradientPair> evaluator{
+  GPUHistEvaluator<GradientPairPrecise> evaluator{
       tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
   evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
   DeviceSplitCandidate result =
-      evaluator.EvaluateSingleSplit(input, 0).split;
+      evaluator.EvaluateSingleSplit(input, shared_inputs,0).split;
 
   EXPECT_EQ(result.findex, 1);
   EXPECT_EQ(result.fvalue, 11.0);
@@ -93,21 +95,23 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
       std::vector<bst_row_t>{0, 2};
   thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0};
   thrust::device_vector<float> feature_min_values = std::vector<float>{0.0};
-  thrust::device_vector<GradientPair> feature_histogram =
-      std::vector<GradientPair>{{-0.5, 0.5}, {0.5, 0.5}};
+  thrust::device_vector<GradientPairPrecise> feature_histogram =
+      std::vector<GradientPairPrecise>{{-0.5, 0.5}, {0.5, 0.5}};
   thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
-  EvaluateSplitInputs<GradientPair> input{1,
+  EvaluateSplitInputs input{1,0,
                                           parent_sum,
-                                          param,
                                           dh::ToSpan(feature_set),
-                                          {},
-                                          dh::ToSpan(feature_segments),
-                                          dh::ToSpan(feature_values),
-                                          dh::ToSpan(feature_min_values),
                                           dh::ToSpan(feature_histogram)};
+  EvaluateSplitSharedInputs shared_inputs{
+      param,
+      {},
+      dh::ToSpan(feature_segments),
+      dh::ToSpan(feature_values),
+      dh::ToSpan(feature_min_values),
+  };
 
-  GPUHistEvaluator<GradientPair> evaluator(tparam, feature_set.size(), 0);
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0).split;
+  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, feature_set.size(), 0);
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs,0).split;
 
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
@ -118,9 +122,9 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {

 TEST(GpuHist, EvaluateSingleSplitEmpty) {
   TrainParam tparam = ZeroParam();
-  GPUHistEvaluator<GradientPair> evaluator(tparam, 1, 0);
+  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, 1, 0);
   DeviceSplitCandidate result =
-      evaluator.EvaluateSingleSplit(EvaluateSplitInputs<GradientPair>{}, 0).split;
+      evaluator.EvaluateSingleSplit(EvaluateSplitInputs{}, EvaluateSplitSharedInputs{}, 0).split;
   EXPECT_EQ(result.findex, -1);
   EXPECT_LT(result.loss_chg, 0.0f);
 }
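
Note: the empty-input test relies on a default-constructed DeviceSplitCandidate reading as "no valid split". A sketch of the defaults the two assertions above depend on — the member list is an assumption; the test itself only checks findex == -1 and a negative loss_chg:

  struct DeviceSplitCandidate {  // sketch, not the real definition
    float loss_chg{-std::numeric_limits<float>::max()};  // any real split must beat this
    int findex{-1};                                      // -1 == no feature chosen
    float fvalue{0.0f};
    // ... default direction, left/right child sums, etc.
  };
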
@ -140,22 +144,24 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
       std::vector<float>{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector<float> feature_min_values =
       std::vector<float>{0.0, 10.0};
-  thrust::device_vector<GradientPair> feature_histogram =
-      std::vector<GradientPair>{
+  thrust::device_vector<GradientPairPrecise> feature_histogram =
+      std::vector<GradientPairPrecise>{
           {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}};
   thrust::device_vector<int> monotonic_constraints(2, 0);
-  EvaluateSplitInputs<GradientPair> input{1,
+  EvaluateSplitInputs input{1, 0,
                             parent_sum,
-                            param,
                             dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
+  EvaluateSplitSharedInputs shared_inputs{
+      param,
       {},
       dh::ToSpan(feature_segments),
       dh::ToSpan(feature_values),
       dh::ToSpan(feature_min_values),
-      dh::ToSpan(feature_histogram)};
+  };

-  GPUHistEvaluator<GradientPair> evaluator(tparam, feature_min_values.size(), 0);
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0).split;
+  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, feature_min_values.size(), 0);
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs, 0).split;

   EXPECT_EQ(result.findex, 1);
   EXPECT_EQ(result.fvalue, 11.0);
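
Note: with the kernels fused, a single node is just a batch of one. A hedged sketch, reusing only names that appear elsewhere in this file, of how the same result could be obtained through the batched entry point — the equivalence itself is an assumption:

  // Assumption: EvaluateSingleSplit(input, shared_inputs, 0) agrees with a
  // one-element batch pushed through LaunchEvaluateSplits.
  dh::device_vector<EvaluateSplitInputs> one(1, input);
  dh::device_vector<DeviceSplitCandidate> out(1);
  evaluator.LaunchEvaluateSplits(input.feature_set.size(), dh::ToSpan(one),
                                 shared_inputs, evaluator.GetEvaluator(),
                                 dh::ToSpan(out));
  DeviceSplitCandidate batched = out[0];  // expected here: findex 1, fvalue 11.0
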
@ -178,22 +184,24 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
       std::vector<float>{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector<float> feature_min_values =
       std::vector<float>{0.0, 10.0};
-  thrust::device_vector<GradientPair> feature_histogram =
-      std::vector<GradientPair>{
+  thrust::device_vector<GradientPairPrecise> feature_histogram =
+      std::vector<GradientPairPrecise>{
           {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}};
   thrust::device_vector<int> monotonic_constraints(2, 0);
-  EvaluateSplitInputs<GradientPair> input{1,
+  EvaluateSplitInputs input{1, 0,
                             parent_sum,
-                            param,
                             dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
+  EvaluateSplitSharedInputs shared_inputs{
+      param,
       {},
       dh::ToSpan(feature_segments),
       dh::ToSpan(feature_values),
       dh::ToSpan(feature_min_values),
-      dh::ToSpan(feature_histogram)};
+  };

-  GPUHistEvaluator<GradientPair> evaluator(tparam, feature_min_values.size(), 0);
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0).split;
+  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, feature_min_values.size(), 0);
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs, 0).split;

   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
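
Note: both features in this test carry identical histograms, so every candidate ties on gain; EXPECT_EQ(result.findex, 0) therefore pins the tie-breaking rule to "first (lowest-index) feature wins". A sketch of an update rule consistent with that expectation — an assumption about the candidate-update logic, not a copy of it:

  // Strict '>' means a tie never replaces the incumbent, so the candidate
  // found first — feature 0 at fvalue 1.0 — survives.
  if (candidate.loss_chg > best.loss_chg) {
    best = candidate;
  }
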
@ -214,37 +222,35 @@ TEST(GpuHist, EvaluateSplits) {
       std::vector<float>{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector<float> feature_min_values =
       std::vector<float>{0.0, 0.0};
-  thrust::device_vector<GradientPair> feature_histogram_left =
-      std::vector<GradientPair>{
+  thrust::device_vector<GradientPairPrecise> feature_histogram_left =
+      std::vector<GradientPairPrecise>{
           {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}};
-  thrust::device_vector<GradientPair> feature_histogram_right =
-      std::vector<GradientPair>{
+  thrust::device_vector<GradientPairPrecise> feature_histogram_right =
+      std::vector<GradientPairPrecise>{
           {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}};
   thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
-  EvaluateSplitInputs<GradientPair> input_left{
-      1,
+  EvaluateSplitInputs input_left{
+      1, 0,
       parent_sum,
-      param,
       dh::ToSpan(feature_set),
-      {},
-      dh::ToSpan(feature_segments),
-      dh::ToSpan(feature_values),
-      dh::ToSpan(feature_min_values),
       dh::ToSpan(feature_histogram_left)};
-  EvaluateSplitInputs<GradientPair> input_right{
-      2,
+  EvaluateSplitInputs input_right{
+      2, 0,
       parent_sum,
-      param,
       dh::ToSpan(feature_set),
-      {},
-      dh::ToSpan(feature_segments),
-      dh::ToSpan(feature_values),
-      dh::ToSpan(feature_min_values),
       dh::ToSpan(feature_histogram_right)};
+  EvaluateSplitSharedInputs shared_inputs{
+      param,
+      {},
+      dh::ToSpan(feature_segments),
+      dh::ToSpan(feature_values),
+      dh::ToSpan(feature_min_values),
+  };

-  GPUHistEvaluator<GradientPair> evaluator{
+  GPUHistEvaluator<GradientPairPrecise> evaluator{
       tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
-  evaluator.EvaluateSplits(input_left, input_right, evaluator.GetEvaluator(),
+  dh::device_vector<EvaluateSplitInputs> inputs =
+      std::vector<EvaluateSplitInputs>{input_left, input_right};
+  evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs, evaluator.GetEvaluator(),
                            dh::ToSpan(out_splits));

   DeviceSplitCandidate result_left = out_splits[0];
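
Note: this hunk is the heart of the fusion these tests verify. The dedicated two-node EvaluateSplits(left, right, ...) entry point is gone; callers pack any number of per-node inputs into one device vector and make a single launch, so sibling (or whole-level) evaluation shares one kernel instead of one launch per node. A hedged usage sketch for N nodes; CollectNodesToEvaluate and max_active_features are hypothetical stand-ins:

  // Assumption: out_splits[i] corresponds to inputs[i], and the first
  // argument is the largest active feature count, as in the call above.
  std::vector<EvaluateSplitInputs> host_inputs = CollectNodesToEvaluate();  // hypothetical
  dh::device_vector<EvaluateSplitInputs> inputs = host_inputs;
  dh::device_vector<DeviceSplitCandidate> out_splits(inputs.size());
  evaluator.LaunchEvaluateSplits(max_active_features, dh::ToSpan(inputs),
                                 shared_inputs, evaluator.GetEvaluator(),
                                 dh::ToSpan(out_splits));
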
@ -273,16 +279,18 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
                         cudaMemcpyHostToDevice));
   dh::device_vector<bst_feature_t> feature_set{std::vector<bst_feature_t>{0}};

-  EvaluateSplitInputs<GradientPairPrecise> input{0,
+  EvaluateSplitInputs input{0, 0,
                             total_gpair_,
-                            GPUTrainingParam{param_},
                             dh::ToSpan(feature_set),
+                            dh::ToSpan(d_hist)};
+  EvaluateSplitSharedInputs shared_inputs{
+      GPUTrainingParam{param_},
       dh::ToSpan(ft),
       cuts_.cut_ptrs_.ConstDeviceSpan(),
       cuts_.cut_values_.ConstDeviceSpan(),
       cuts_.min_vals_.ConstDeviceSpan(),
-      dh::ToSpan(d_hist)};
+  };
-  auto split = evaluator.EvaluateSingleSplit(input, 0).split;
+  auto split = evaluator.EvaluateSingleSplit(input, shared_inputs, 0).split;
   ASSERT_NEAR(split.loss_chg, best_score_, 1e-16);
 }
 }  // namespace tree
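
Note: in the partition-based test above, the per-node histogram (d_hist) is indexed through the shared cut spans. A sketch of the addressing convention assumed throughout this file — the field names follow the inferred struct sketch near the top of these tests, not the actual headers:

  // Bins of feature fidx occupy [feature_segments[fidx], feature_segments[fidx + 1])
  // within the node's gradient histogram.
  auto begin = shared_inputs.feature_segments[fidx];
  auto end   = shared_inputs.feature_segments[fidx + 1];
  auto bins  = input.gradient_histogram.subspan(begin, end - begin);
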
@ -285,13 +285,15 @@ class TestDistributedGPU:
                 'booster']
     assert hasattr(booster, 'best_score')
     dump = booster.get_dump(dump_format='json')
+    print(booster.best_iteration)
     assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

     valid_X = X
     valid_y = y
     cls = dxgb.DaskXGBClassifier(objective='binary:logistic',
                                  tree_method='gpu_hist',
+                                 eval_metric='error',
                                  n_estimators=100)
     cls.client = client
     cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
             eval_set=[(valid_X, valid_y)])