From df984f9c433061ebf3e97e0e3bd4b15c5577c9c8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 9 Feb 2023 18:31:49 +0800 Subject: [PATCH] [backport] Fix different number of features in gpu_hist evaluator. (#8754) (#8769) Co-authored-by: Rory Mitchell --- src/tree/gpu_hist/evaluate_splits.cu | 49 ++++++++++++------- src/tree/gpu_hist/evaluate_splits.cuh | 13 +++-- src/tree/updater_gpu_hist.cu | 31 ++++++------ .../test_gpu_interaction_constraints.py | 39 +++++++++++++-- 4 files changed, 93 insertions(+), 39 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index e471c8d36..781fff92a 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -248,8 +248,10 @@ class EvaluateSplitAgent { template __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( - bst_feature_t number_active_features, common::Span d_inputs, - const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, + bst_feature_t max_active_features, + common::Span d_inputs, + const EvaluateSplitSharedInputs shared_inputs, + common::Span sorted_idx, const TreeEvaluator::SplitEvaluator evaluator, common::Span out_candidates) { // Aligned && shared storage for best_split @@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( __syncthreads(); // Allocate blocks to one feature of one node - const auto input_idx = blockIdx.x / number_active_features; + const auto input_idx = blockIdx.x / max_active_features; const EvaluateSplitInputs &inputs = d_inputs[input_idx]; // One block for each feature. Features are sampled, so fidx != blockIdx.x - - int fidx = inputs.feature_set[blockIdx.x % number_active_features]; + // Some blocks may not have any feature to work on, simply return + int feature_offset = blockIdx.x % max_active_features; + if (feature_offset >= inputs.feature_set.size()) { + return; + } + int fidx = inputs.feature_set[feature_offset]; using AgentT = EvaluateSplitAgent; __shared__ typename AgentT::TempStorage temp_storage; @@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu } void GPUHistEvaluator::LaunchEvaluateSplits( - bst_feature_t number_active_features, common::Span d_inputs, + bst_feature_t max_active_features, + common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { @@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits( this->SortHistogram(d_inputs, shared_inputs, evaluator); } - size_t combined_num_features = number_active_features * d_inputs.size(); - dh::TemporaryArray feature_best_splits(combined_num_features); + size_t combined_num_features = max_active_features * d_inputs.size(); + dh::TemporaryArray feature_best_splits( + combined_num_features, DeviceSplitCandidate()); // One block for each feature uint32_t constexpr kBlockThreads = 32; - dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, number_active_features, d_inputs, - shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()), + dh::LaunchKernel{static_cast(combined_num_features), kBlockThreads, + 0}( + EvaluateSplitsKernel, max_active_features, d_inputs, + shared_inputs, + this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features - auto reduce_offset = dh::MakeTransformIterator( - thrust::make_counting_iterator(0llu), - [=] __device__(size_t idx) -> size_t { return idx * number_active_features; }); + auto reduce_offset = + dh::MakeTransformIterator(thrust::make_counting_iterator(0llu), + [=] __device__(size_t idx) -> size_t { + return idx * max_active_features; + }); size_t temp_storage_bytes = 0; auto num_segments = out_splits.size(); cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(), @@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector &nidx) { } void GPUHistEvaluator::EvaluateSplits( - const std::vector &nidx, bst_feature_t number_active_features, - common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, + const std::vector &nidx, bst_feature_t max_active_features, + common::Span d_inputs, + EvaluateSplitSharedInputs shared_inputs, common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator, - out_splits); + this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs, + evaluator, out_splits); auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index d3174c4df..0b44f31aa 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -170,13 +170,18 @@ class GPUHistEvaluator { TreeEvaluator::SplitEvaluator evaluator); // impl of evaluate splits, contains CUDA kernels so it's public - void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span d_inputs,EvaluateSplitSharedInputs shared_inputs, - TreeEvaluator::SplitEvaluator evaluator, - common::Span out_splits); + void LaunchEvaluateSplits( + bst_feature_t max_active_features, + common::Span d_inputs, + EvaluateSplitSharedInputs shared_inputs, + TreeEvaluator::SplitEvaluator evaluator, + common::Span out_splits); /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(const std::vector &nidx,bst_feature_t number_active_features,common::Span d_inputs, + void EvaluateSplits(const std::vector &nidx, + bst_feature_t max_active_features, + common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, common::Span out_splits); /** diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index b90a7ce09..3b01d7437 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -318,24 +318,27 @@ struct GPUHistMakerDevice { auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); right_sampled_features->SetDevice(ctx_->gpu_id); common::Span right_feature_set = - interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); - h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum, - left_feature_set, hist.GetNodeHistogram(left_nidx)}; - h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum, - right_feature_set, hist.GetNodeHistogram(right_nidx)}; + interaction_constraints.Query(right_sampled_features->DeviceSpan(), + right_nidx); + h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, + candidate.split.left_sum, left_feature_set, + hist.GetNodeHistogram(left_nidx)}; + h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, + candidate.split.right_sum, right_feature_set, + hist.GetNodeHistogram(right_nidx)}; } - bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); + bst_feature_t max_active_features = 0; for (auto input : h_node_inputs) { - CHECK_EQ(input.feature_set.size(), number_active_features) - << "Current implementation assumes that the number of active features " - "(after sampling) in any node is the same"; + max_active_features = std::max(max_active_features, + bst_feature_t(input.feature_set.size())); } - dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), - h_node_inputs.size() * sizeof(EvaluateSplitInputs), - cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync( + d_node_inputs.data().get(), h_node_inputs.data(), + h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); - this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs), - shared_inputs, dh::ToSpan(entries)); + this->evaluator_.EvaluateSplits(nidx, max_active_features, + dh::ToSpan(d_node_inputs), shared_inputs, + dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); diff --git a/tests/python-gpu/test_gpu_interaction_constraints.py b/tests/python-gpu/test_gpu_interaction_constraints.py index 885cf5bf9..434cc15da 100644 --- a/tests/python-gpu/test_gpu_interaction_constraints.py +++ b/tests/python-gpu/test_gpu_interaction_constraints.py @@ -1,8 +1,14 @@ -import numpy as np import sys + +import numpy as np +import pandas as pd + +import xgboost as xgb + sys.path.append("tests/python") # Don't import the test class, otherwise they will run twice. import test_interaction_constraints as test_ic # noqa + rng = np.random.RandomState(1994) @@ -10,7 +16,34 @@ class TestGPUInteractionConstraints: cputest = test_ic.TestInteractionConstraints() def test_interaction_constraints(self): - self.cputest.run_interaction_constraints(tree_method='gpu_hist') + self.cputest.run_interaction_constraints(tree_method="gpu_hist") def test_training_accuracy(self): - self.cputest.training_accuracy(tree_method='gpu_hist') + self.cputest.training_accuracy(tree_method="gpu_hist") + + # case where different number of features can occur in the evaluator + def test_issue_8730(self): + X = pd.DataFrame( + zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)), + columns=["A", "B", "C", "D"], + ) + y = np.array([*([0] * 50), *([1] * 50)]) + dm = xgb.DMatrix(X, label=y) + + params = { + "eta": 0.16095019509249486, + "min_child_weight": 1, + "subsample": 0.688567929338029, + "colsample_bynode": 0.7, + "gamma": 5.666579817418348e-06, + "lambda": 0.14943712232059794, + "grow_policy": "depthwise", + "max_depth": 3, + "tree_method": "gpu_hist", + "interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]], + "objective": "count:poisson", + "eval_metric": "poisson-nloglik", + "verbosity": 0, + } + + xgb.train(params, dm, num_boost_round=100)