Add max_cat_threshold to GPU and handle missing cat values. (#8212)
This commit is contained in:
@@ -43,9 +43,9 @@ class EvaluateSplitAgent {
|
||||
public:
|
||||
using ArgMaxT = cub::KeyValuePair<int, float>;
|
||||
using BlockScanT = cub::BlockScan<GradientPairPrecise, kBlockSize>;
|
||||
using MaxReduceT =
|
||||
cub::WarpReduce<ArgMaxT>;
|
||||
using MaxReduceT = cub::WarpReduce<ArgMaxT>;
|
||||
using SumReduceT = cub::WarpReduce<GradientPairPrecise>;
|
||||
|
||||
struct TempStorage {
|
||||
typename BlockScanT::TempStorage scan;
|
||||
typename MaxReduceT::TempStorage max_reduce;
|
||||
@@ -159,49 +159,81 @@ class EvaluateSplitAgent {
|
||||
if (threadIdx.x == best_thread) {
|
||||
int32_t split_gidx = (scan_begin + threadIdx.x);
|
||||
float fvalue = feature_values[split_gidx];
|
||||
GradientPairPrecise left =
|
||||
missing_left ? bin + missing : bin;
|
||||
GradientPairPrecise left = missing_left ? bin + missing : bin;
|
||||
GradientPairPrecise right = parent_sum - left;
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
|
||||
true, param);
|
||||
best_split->UpdateCat(gain, missing_left ? kLeftDir : kRightDir,
|
||||
static_cast<bst_cat_t>(fvalue), fidx, left, right, param);
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* \brief Gather and update the best split.
|
||||
*/
|
||||
__device__ __forceinline__ void PartitionUpdate(bst_bin_t scan_begin, bool thread_active,
|
||||
bool missing_left, bst_bin_t it,
|
||||
GradientPairPrecise const &left_sum,
|
||||
GradientPairPrecise const &right_sum,
|
||||
DeviceSplitCandidate *__restrict__ best_split) {
|
||||
auto gain =
|
||||
thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) : kNullGain;
|
||||
|
||||
// Find thread with best gain
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
|
||||
// This reduce result is only valid in thread 0
|
||||
// broadcast to the rest of the warp
|
||||
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
|
||||
// Best thread updates the split
|
||||
if (threadIdx.x == best_thread) {
|
||||
assert(thread_active);
|
||||
// index of best threshold inside a feature.
|
||||
auto best_thresh = it - gidx_begin;
|
||||
best_split->UpdateCat(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left_sum,
|
||||
right_sum, param);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* \brief Partition-based split for categorical feature.
|
||||
*/
|
||||
__device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split,
|
||||
bst_feature_t * __restrict__ sorted_idx,
|
||||
std::size_t offset) {
|
||||
for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
|
||||
bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
|
||||
common::Span<bst_feature_t> sorted_idx,
|
||||
std::size_t node_offset,
|
||||
GPUTrainingParam const ¶m) {
|
||||
bst_bin_t n_bins_feature = gidx_end - gidx_begin;
|
||||
auto n_bins = std::min(param.max_cat_threshold, n_bins_feature);
|
||||
|
||||
auto rest = thread_active
|
||||
? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset)
|
||||
: GradientPairPrecise();
|
||||
bst_bin_t it_begin = gidx_begin;
|
||||
bst_bin_t it_end = it_begin + n_bins - 1;
|
||||
|
||||
// forward
|
||||
for (bst_bin_t scan_begin = it_begin; scan_begin < it_end; scan_begin += kBlockSize) {
|
||||
auto it = scan_begin + static_cast<bst_bin_t>(threadIdx.x);
|
||||
bool thread_active = it < it_end;
|
||||
|
||||
auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset)
|
||||
: GradientPairPrecise();
|
||||
// No min value for cat feature, use inclusive scan.
|
||||
BlockScanT(temp_storage->scan).InclusiveSum(rest, rest, prefix_op);
|
||||
GradientPairPrecise bin = parent_sum - rest - missing;
|
||||
BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op);
|
||||
GradientPairPrecise left_sum = parent_sum - right_sum;
|
||||
|
||||
// Whether the gradient of missing values is put to the left side.
|
||||
bool missing_left = true;
|
||||
float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx,
|
||||
evaluator, missing_left)
|
||||
: kNullGain;
|
||||
PartitionUpdate(scan_begin, thread_active, true, it, left_sum, right_sum, best_split);
|
||||
}
|
||||
|
||||
// backward
|
||||
it_begin = gidx_end - 1;
|
||||
it_end = it_begin - n_bins + 1;
|
||||
prefix_op = SumCallbackOp<GradientPairPrecise>{}; // reset
|
||||
|
||||
// Find thread with best gain
|
||||
auto best =
|
||||
MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
|
||||
// This reduce result is only valid in thread 0
|
||||
// broadcast to the rest of the warp
|
||||
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
|
||||
// Best thread updates the split
|
||||
if (threadIdx.x == best_thread) {
|
||||
GradientPairPrecise left = missing_left ? bin + missing : bin;
|
||||
GradientPairPrecise right = parent_sum - left;
|
||||
auto best_thresh =
|
||||
threadIdx.x + (scan_begin - gidx_begin); // index of best threshold inside a feature.
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left,
|
||||
right, true, param);
|
||||
}
|
||||
for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) {
|
||||
auto it = scan_begin - static_cast<bst_bin_t>(threadIdx.x);
|
||||
bool thread_active = it > it_end;
|
||||
|
||||
auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset)
|
||||
: GradientPairPrecise();
|
||||
// No min value for cat feature, use inclusive scan.
|
||||
BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, prefix_op);
|
||||
GradientPairPrecise right_sum = parent_sum - left_sum;
|
||||
|
||||
PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -242,7 +274,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
||||
auto total_bins = shared_inputs.feature_values.size();
|
||||
size_t offset = total_bins * input_idx;
|
||||
auto node_sorted_idx = sorted_idx.subspan(offset, total_bins);
|
||||
agent.Partition(&best_split, node_sorted_idx.data(), offset);
|
||||
agent.Partition(&best_split, node_sorted_idx, offset, shared_inputs.param);
|
||||
}
|
||||
} else {
|
||||
agent.Numerical(&best_split);
|
||||
@@ -273,36 +305,28 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
|
||||
|
||||
// Simple case for one hot split
|
||||
if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
|
||||
out_split.split_cats.Set(common::AsCat(out_split.fvalue));
|
||||
out_split.split_cats.Set(common::AsCat(out_split.thresh));
|
||||
return;
|
||||
}
|
||||
|
||||
// partition-based split
|
||||
auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx,
|
||||
shared_inputs.feature_values.size());
|
||||
size_t node_offset = input_idx * shared_inputs.feature_values.size();
|
||||
auto best_thresh = out_split.PopBestThresh();
|
||||
auto const best_thresh = out_split.thresh;
|
||||
if (best_thresh == -1) {
|
||||
return;
|
||||
}
|
||||
auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx],
|
||||
shared_inputs.FeatureBins(fidx));
|
||||
if (out_split.dir != kLeftDir) {
|
||||
// forward, missing on right
|
||||
auto beg = dh::tcbegin(f_sorted_idx);
|
||||
// Don't put all the categories into one side
|
||||
auto boundary = std::min(static_cast<size_t>((best_thresh + 1)), (f_sorted_idx.size() - 1));
|
||||
boundary = std::max(boundary, static_cast<size_t>(1ul));
|
||||
auto end = beg + boundary;
|
||||
thrust::for_each(thrust::seq, beg, end, [&](auto c) {
|
||||
auto cat = shared_inputs.feature_values[c - node_offset];
|
||||
assert(!out_split.split_cats.Check(cat) && "already set");
|
||||
out_split.SetCat(cat);
|
||||
});
|
||||
} else {
|
||||
assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0");
|
||||
thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx),
|
||||
dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) {
|
||||
auto cat = shared_inputs.feature_values[c - node_offset];
|
||||
out_split.SetCat(cat);
|
||||
});
|
||||
}
|
||||
bool forward = out_split.dir == kLeftDir;
|
||||
bst_bin_t partition = forward ? best_thresh + 1 : best_thresh;
|
||||
auto beg = dh::tcbegin(f_sorted_idx);
|
||||
assert(partition > 0 && "Invalid partition.");
|
||||
thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
|
||||
auto cat = shared_inputs.feature_values[c - node_offset];
|
||||
out_split.SetCat(cat);
|
||||
});
|
||||
}
|
||||
|
||||
void GPUHistEvaluator::LaunchEvaluateSplits(
|
||||
|
||||
@@ -141,7 +141,8 @@ class GPUHistEvaluator {
|
||||
*/
|
||||
common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
|
||||
copy_stream_.View().Sync();
|
||||
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
|
||||
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
|
||||
nidx * node_categorical_storage_size_, node_categorical_storage_size_);
|
||||
return cats_out;
|
||||
}
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user