Unify evaluation functions. (#6037)

This commit is contained in:
Jiaming Yuan
2020-08-26 14:23:27 +08:00
committed by GitHub
parent 80c8547147
commit 2fcc4f2886
29 changed files with 570 additions and 734 deletions

View File

@@ -14,9 +14,16 @@ struct ExpandEntry {
int nid;
int depth;
DeviceSplitCandidate split;
float base_weight { std::numeric_limits<float>::quiet_NaN() };
float left_weight { std::numeric_limits<float>::quiet_NaN() };
float right_weight { std::numeric_limits<float>::quiet_NaN() };
ExpandEntry() = default;
XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split)
: nid(nid), depth(depth), split(std::move(split)) {}
XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split,
float base, float left, float right)
: nid(nid), depth(depth), split(std::move(split)), base_weight{base},
left_weight{left}, right_weight{right} {}
bool IsValid(const TrainParam& param, int num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {

View File

@@ -9,19 +9,20 @@ namespace tree {
// With constraints
template <typename GradientPairT>
XGBOOST_DEVICE float LossChangeMissing(const GradientPairT& scan,
const GradientPairT& missing,
const GradientPairT& parent_sum,
const GPUTrainingParam& param,
int constraint,
const ValueConstraint& value_constraint,
bool& missing_left_out) { // NOLINT
XGBOOST_DEVICE float
LossChangeMissing(const GradientPairT &scan, const GradientPairT &missing,
const GradientPairT &parent_sum,
const GPUTrainingParam &param,
bst_node_t nidx,
bst_feature_t fidx,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
bool &missing_left_out) { // NOLINT
float parent_gain = CalcGain(param, parent_sum);
float missing_left_gain = value_constraint.CalcSplitGain(
param, constraint, GradStats(scan + missing),
GradStats(parent_sum - (scan + missing)));
float missing_right_gain = value_constraint.CalcSplitGain(
param, constraint, GradStats(scan), GradStats(parent_sum - scan));
float missing_left_gain =
evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
GradStats(parent_sum - (scan + missing)));
float missing_right_gain = evaluator.CalcSplitGain(
param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan));
if (missing_left_gain >= missing_right_gain) {
missing_left_out = true;
@@ -74,6 +75,7 @@ template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
__device__ void EvaluateFeature(
int fidx, EvaluateSplitInputs<GradientSumT> inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
DeviceSplitCandidate* best_split, // shared memory storing best split
TempStorageT* temp_storage // temp memory for cub operations
) {
@@ -107,8 +109,10 @@ __device__ void EvaluateFeature(
float gain = null_gain;
if (thread_active) {
gain = LossChangeMissing(bin, missing, inputs.parent_sum, inputs.param,
inputs.monotonic_constraints[fidx],
inputs.value_constraint, missing_left);
inputs.nidx,
fidx,
evaluator,
missing_left);
}
__syncthreads();
@@ -148,6 +152,7 @@ template <int BLOCK_THREADS, typename GradientSumT>
__global__ void EvaluateSplitsKernel(
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) {
// KeyValuePair here used as threadIdx.x -> gain_value
using ArgMaxT = cub::KeyValuePair<int, float>;
@@ -183,7 +188,7 @@ __global__ void EvaluateSplitsKernel(
: blockIdx.x - left.feature_set.size()];
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT>(
fidx, inputs, &best_split, &temp_storage);
fidx, inputs, evaluator, &best_split, &temp_storage);
__syncthreads();
@@ -200,6 +205,7 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a,
template <typename GradientSumT>
void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right) {
size_t combined_num_features =
@@ -209,7 +215,7 @@ void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
// One block for each feature
uint32_t constexpr kBlockThreads = 256;
dh::LaunchKernel {uint32_t(combined_num_features), kBlockThreads, 0}(
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right,
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, evaluator,
dh::ToSpan(feature_best_splits));
// Reduce to get best candidate for left and right child over all features
@@ -240,23 +246,28 @@ void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
template <typename GradientSumT>
void EvaluateSingleSplit(common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> input) {
EvaluateSplits(out_split, input, {});
EvaluateSplits(out_split, evaluator, input, {});
}
template void EvaluateSplits<GradientPair>(
common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPair> left,
EvaluateSplitInputs<GradientPair> right);
template void EvaluateSplits<GradientPairPrecise>(
common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPairPrecise> left,
EvaluateSplitInputs<GradientPairPrecise> right);
template void EvaluateSingleSplit<GradientPair>(
common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPair> input);
template void EvaluateSingleSplit<GradientPairPrecise>(
common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPairPrecise> input);
} // namespace tree
} // namespace xgboost

View File

@@ -5,6 +5,7 @@
#define EVALUATE_SPLITS_CUH_
#include <xgboost/span.h>
#include "../../data/ellpack_page.cuh"
#include "../split_evaluator.h"
#include "../constraints.cuh"
#include "../updater_gpu_common.cuh"
@@ -21,15 +22,15 @@ struct EvaluateSplitInputs {
common::Span<const float> feature_values;
common::Span<const float> min_fvalue;
common::Span<const GradientSumT> gradient_histogram;
ValueConstraint value_constraint;
common::Span<const int> monotonic_constraints;
};
template <typename GradientSumT>
void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right);
template <typename GradientSumT>
void EvaluateSingleSplit(common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> input);
} // namespace tree
} // namespace xgboost

View File

@@ -81,7 +81,7 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
auto counting = thrust::make_counting_iterator(0llu);
auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
counting, [=] __device__(size_t idx) {
return IndexFlagTuple{idx, position[idx] == left_nidx};
return IndexFlagTuple{idx, static_cast<size_t>(position[idx] == left_nidx)};
});
size_t temp_bytes = 0;
cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator,

View File

@@ -124,8 +124,7 @@ class RowPartitioner {
dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t),
cudaMemcpyDeviceToHost, streams_[0]));
SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]
);
SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]);
dh::safe_cuda(cudaStreamSynchronize(streams_[0]));
CHECK_LE(left_count, segment.Size());