[backport] Fix different number of features in gpu_hist evaluator. (#8754) (#8769)

Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>
This commit is contained in:
Jiaming Yuan 2023-02-09 18:31:49 +08:00 committed by GitHub
parent 2f22f8d49b
commit df984f9c43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 93 additions and 39 deletions

View File

@ -248,8 +248,10 @@ class EvaluateSplitAgent {
template <int kBlockSize> template <int kBlockSize>
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs, bst_feature_t max_active_features,
const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx, common::Span<const EvaluateSplitInputs> d_inputs,
const EvaluateSplitSharedInputs shared_inputs,
common::Span<bst_feature_t> sorted_idx,
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator, const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) { common::Span<DeviceSplitCandidate> out_candidates) {
// Aligned && shared storage for best_split // Aligned && shared storage for best_split
@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
__syncthreads(); __syncthreads();
// Allocate blocks to one feature of one node // Allocate blocks to one feature of one node
const auto input_idx = blockIdx.x / number_active_features; const auto input_idx = blockIdx.x / max_active_features;
const EvaluateSplitInputs &inputs = d_inputs[input_idx]; const EvaluateSplitInputs &inputs = d_inputs[input_idx];
// One block for each feature. Features are sampled, so fidx != blockIdx.x // One block for each feature. Features are sampled, so fidx != blockIdx.x
// Some blocks may not have any feature to work on, simply return
int fidx = inputs.feature_set[blockIdx.x % number_active_features]; int feature_offset = blockIdx.x % max_active_features;
if (feature_offset >= inputs.feature_set.size()) {
return;
}
int fidx = inputs.feature_set[feature_offset];
using AgentT = EvaluateSplitAgent<kBlockSize>; using AgentT = EvaluateSplitAgent<kBlockSize>;
__shared__ typename AgentT::TempStorage temp_storage; __shared__ typename AgentT::TempStorage temp_storage;
@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
} }
void GPUHistEvaluator::LaunchEvaluateSplits( void GPUHistEvaluator::LaunchEvaluateSplits(
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs, EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator, TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits) { common::Span<DeviceSplitCandidate> out_splits) {
@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
this->SortHistogram(d_inputs, shared_inputs, evaluator); this->SortHistogram(d_inputs, shared_inputs, evaluator);
} }
size_t combined_num_features = number_active_features * d_inputs.size(); size_t combined_num_features = max_active_features * d_inputs.size();
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features); dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
combined_num_features, DeviceSplitCandidate());
// One block for each feature // One block for each feature
uint32_t constexpr kBlockThreads = 32; uint32_t constexpr kBlockThreads = 32;
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}( dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs, 0}(
shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()), EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
shared_inputs,
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
evaluator, dh::ToSpan(feature_best_splits)); evaluator, dh::ToSpan(feature_best_splits));
// Reduce to get best candidate for left and right child over all features // Reduce to get best candidate for left and right child over all features
auto reduce_offset = dh::MakeTransformIterator<size_t>( auto reduce_offset =
thrust::make_counting_iterator(0llu), dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) -> size_t { return idx * number_active_features; }); [=] __device__(size_t idx) -> size_t {
return idx * max_active_features;
});
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
auto num_segments = out_splits.size(); auto num_segments = out_splits.size();
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(), cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
} }
void GPUHistEvaluator::EvaluateSplits( void GPUHistEvaluator::EvaluateSplits(
const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features, const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs, common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) { common::Span<GPUExpandEntry> out_entries) {
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>(); auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size()); dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
auto out_splits = dh::ToSpan(splits_out_storage); auto out_splits = dh::ToSpan(splits_out_storage);
this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator, this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
out_splits); evaluator, out_splits);
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries; auto d_entries = out_entries;

View File

@ -170,13 +170,18 @@ class GPUHistEvaluator {
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator); TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
// impl of evaluate splits, contains CUDA kernels so it's public // impl of evaluate splits, contains CUDA kernels so it's public
void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs, void LaunchEvaluateSplits(
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator, TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits); common::Span<DeviceSplitCandidate> out_splits);
/** /**
* \brief Evaluate splits for left and right nodes. * \brief Evaluate splits for left and right nodes.
*/ */
void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs, void EvaluateSplits(const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs, EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_splits); common::Span<GPUExpandEntry> out_splits);
/** /**

View File

@ -318,24 +318,27 @@ struct GPUHistMakerDevice {
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id); right_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> right_feature_set = common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); interaction_constraints.Query(right_sampled_features->DeviceSpan(),
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum, right_nidx);
left_feature_set, hist.GetNodeHistogram(left_nidx)}; h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum, candidate.split.left_sum, left_feature_set,
right_feature_set, hist.GetNodeHistogram(right_nidx)}; hist.GetNodeHistogram(left_nidx)};
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
candidate.split.right_sum, right_feature_set,
hist.GetNodeHistogram(right_nidx)};
} }
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); bst_feature_t max_active_features = 0;
for (auto input : h_node_inputs) { for (auto input : h_node_inputs) {
CHECK_EQ(input.feature_set.size(), number_active_features) max_active_features = std::max(max_active_features,
<< "Current implementation assumes that the number of active features " bst_feature_t(input.feature_set.size()));
"(after sampling) in any node is the same";
} }
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), dh::safe_cuda(cudaMemcpyAsync(
h_node_inputs.size() * sizeof(EvaluateSplitInputs), d_node_inputs.data().get(), h_node_inputs.data(),
cudaMemcpyDefault)); h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs), this->evaluator_.EvaluateSplits(nidx, max_active_features,
shared_inputs, dh::ToSpan(entries)); dh::ToSpan(d_node_inputs), shared_inputs,
dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));

View File

@ -1,8 +1,14 @@
import numpy as np
import sys import sys
import numpy as np
import pandas as pd
import xgboost as xgb
sys.path.append("tests/python") sys.path.append("tests/python")
# Don't import the test class, otherwise they will run twice. # Don't import the test class, otherwise they will run twice.
import test_interaction_constraints as test_ic # noqa import test_interaction_constraints as test_ic # noqa
rng = np.random.RandomState(1994) rng = np.random.RandomState(1994)
@ -10,7 +16,34 @@ class TestGPUInteractionConstraints:
cputest = test_ic.TestInteractionConstraints() cputest = test_ic.TestInteractionConstraints()
def test_interaction_constraints(self): def test_interaction_constraints(self):
self.cputest.run_interaction_constraints(tree_method='gpu_hist') self.cputest.run_interaction_constraints(tree_method="gpu_hist")
def test_training_accuracy(self): def test_training_accuracy(self):
self.cputest.training_accuracy(tree_method='gpu_hist') self.cputest.training_accuracy(tree_method="gpu_hist")
# case where different number of features can occur in the evaluator
def test_issue_8730(self):
X = pd.DataFrame(
zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)),
columns=["A", "B", "C", "D"],
)
y = np.array([*([0] * 50), *([1] * 50)])
dm = xgb.DMatrix(X, label=y)
params = {
"eta": 0.16095019509249486,
"min_child_weight": 1,
"subsample": 0.688567929338029,
"colsample_bynode": 0.7,
"gamma": 5.666579817418348e-06,
"lambda": 0.14943712232059794,
"grow_policy": "depthwise",
"max_depth": 3,
"tree_method": "gpu_hist",
"interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]],
"objective": "count:poisson",
"eval_metric": "poisson-nloglik",
"verbosity": 0,
}
xgb.train(params, dm, num_boost_round=100)