Fix different number of features in gpu_hist evaluator. (#8754)
This commit is contained in:
parent
66191e9926
commit
7214a45e83
@ -248,8 +248,10 @@ class EvaluateSplitAgent {
|
|||||||
|
|
||||||
template <int kBlockSize>
|
template <int kBlockSize>
|
||||||
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
||||||
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
|
bst_feature_t max_active_features,
|
||||||
const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
|
const EvaluateSplitSharedInputs shared_inputs,
|
||||||
|
common::Span<bst_feature_t> sorted_idx,
|
||||||
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||||
common::Span<DeviceSplitCandidate> out_candidates) {
|
common::Span<DeviceSplitCandidate> out_candidates) {
|
||||||
// Aligned && shared storage for best_split
|
// Aligned && shared storage for best_split
|
||||||
@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// Allocate blocks to one feature of one node
|
// Allocate blocks to one feature of one node
|
||||||
const auto input_idx = blockIdx.x / number_active_features;
|
const auto input_idx = blockIdx.x / max_active_features;
|
||||||
const EvaluateSplitInputs &inputs = d_inputs[input_idx];
|
const EvaluateSplitInputs &inputs = d_inputs[input_idx];
|
||||||
// One block for each feature. Features are sampled, so fidx != blockIdx.x
|
// One block for each feature. Features are sampled, so fidx != blockIdx.x
|
||||||
|
// Some blocks may not have any feature to work on, simply return
|
||||||
int fidx = inputs.feature_set[blockIdx.x % number_active_features];
|
int feature_offset = blockIdx.x % max_active_features;
|
||||||
|
if (feature_offset >= inputs.feature_set.size()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int fidx = inputs.feature_set[feature_offset];
|
||||||
|
|
||||||
using AgentT = EvaluateSplitAgent<kBlockSize>;
|
using AgentT = EvaluateSplitAgent<kBlockSize>;
|
||||||
__shared__ typename AgentT::TempStorage temp_storage;
|
__shared__ typename AgentT::TempStorage temp_storage;
|
||||||
@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
|
|||||||
}
|
}
|
||||||
|
|
||||||
void GPUHistEvaluator::LaunchEvaluateSplits(
|
void GPUHistEvaluator::LaunchEvaluateSplits(
|
||||||
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
|
bst_feature_t max_active_features,
|
||||||
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
EvaluateSplitSharedInputs shared_inputs,
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||||
common::Span<DeviceSplitCandidate> out_splits) {
|
common::Span<DeviceSplitCandidate> out_splits) {
|
||||||
@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
|
|||||||
this->SortHistogram(d_inputs, shared_inputs, evaluator);
|
this->SortHistogram(d_inputs, shared_inputs, evaluator);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t combined_num_features = number_active_features * d_inputs.size();
|
size_t combined_num_features = max_active_features * d_inputs.size();
|
||||||
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
|
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
|
||||||
|
combined_num_features, DeviceSplitCandidate());
|
||||||
|
|
||||||
// One block for each feature
|
// One block for each feature
|
||||||
uint32_t constexpr kBlockThreads = 32;
|
uint32_t constexpr kBlockThreads = 32;
|
||||||
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
|
dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
|
||||||
EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs,
|
0}(
|
||||||
shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
|
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
|
||||||
|
shared_inputs,
|
||||||
|
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
|
||||||
evaluator, dh::ToSpan(feature_best_splits));
|
evaluator, dh::ToSpan(feature_best_splits));
|
||||||
|
|
||||||
// Reduce to get best candidate for left and right child over all features
|
// Reduce to get best candidate for left and right child over all features
|
||||||
auto reduce_offset = dh::MakeTransformIterator<size_t>(
|
auto reduce_offset =
|
||||||
thrust::make_counting_iterator(0llu),
|
dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
|
||||||
[=] __device__(size_t idx) -> size_t { return idx * number_active_features; });
|
[=] __device__(size_t idx) -> size_t {
|
||||||
|
return idx * max_active_features;
|
||||||
|
});
|
||||||
size_t temp_storage_bytes = 0;
|
size_t temp_storage_bytes = 0;
|
||||||
auto num_segments = out_splits.size();
|
auto num_segments = out_splits.size();
|
||||||
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
|
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
|
||||||
@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void GPUHistEvaluator::EvaluateSplits(
|
void GPUHistEvaluator::EvaluateSplits(
|
||||||
const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
|
const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
|
||||||
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
common::Span<GPUExpandEntry> out_entries) {
|
common::Span<GPUExpandEntry> out_entries) {
|
||||||
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
|
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
|
||||||
|
|
||||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
|
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
|
||||||
auto out_splits = dh::ToSpan(splits_out_storage);
|
auto out_splits = dh::ToSpan(splits_out_storage);
|
||||||
this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator,
|
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
|
||||||
out_splits);
|
evaluator, out_splits);
|
||||||
|
|
||||||
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
||||||
auto d_entries = out_entries;
|
auto d_entries = out_entries;
|
||||||
|
|||||||
@ -170,13 +170,18 @@ class GPUHistEvaluator {
|
|||||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
|
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
|
||||||
|
|
||||||
// impl of evaluate splits, contains CUDA kernels so it's public
|
// impl of evaluate splits, contains CUDA kernels so it's public
|
||||||
void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs,
|
void LaunchEvaluateSplits(
|
||||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
bst_feature_t max_active_features,
|
||||||
common::Span<DeviceSplitCandidate> out_splits);
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
|
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||||
|
common::Span<DeviceSplitCandidate> out_splits);
|
||||||
/**
|
/**
|
||||||
* \brief Evaluate splits for left and right nodes.
|
* \brief Evaluate splits for left and right nodes.
|
||||||
*/
|
*/
|
||||||
void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,
|
void EvaluateSplits(const std::vector<bst_node_t> &nidx,
|
||||||
|
bst_feature_t max_active_features,
|
||||||
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
EvaluateSplitSharedInputs shared_inputs,
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
common::Span<GPUExpandEntry> out_splits);
|
common::Span<GPUExpandEntry> out_splits);
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -319,24 +319,27 @@ struct GPUHistMakerDevice {
|
|||||||
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
||||||
right_sampled_features->SetDevice(ctx_->gpu_id);
|
right_sampled_features->SetDevice(ctx_->gpu_id);
|
||||||
common::Span<bst_feature_t> right_feature_set =
|
common::Span<bst_feature_t> right_feature_set =
|
||||||
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
|
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
|
||||||
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
|
right_nidx);
|
||||||
left_feature_set, hist.GetNodeHistogram(left_nidx)};
|
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
|
||||||
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
|
candidate.split.left_sum, left_feature_set,
|
||||||
right_feature_set, hist.GetNodeHistogram(right_nidx)};
|
hist.GetNodeHistogram(left_nidx)};
|
||||||
|
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
|
||||||
|
candidate.split.right_sum, right_feature_set,
|
||||||
|
hist.GetNodeHistogram(right_nidx)};
|
||||||
}
|
}
|
||||||
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
|
bst_feature_t max_active_features = 0;
|
||||||
for (auto input : h_node_inputs) {
|
for (auto input : h_node_inputs) {
|
||||||
CHECK_EQ(input.feature_set.size(), number_active_features)
|
max_active_features = std::max(max_active_features,
|
||||||
<< "Current implementation assumes that the number of active features "
|
bst_feature_t(input.feature_set.size()));
|
||||||
"(after sampling) in any node is the same";
|
|
||||||
}
|
}
|
||||||
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
|
dh::safe_cuda(cudaMemcpyAsync(
|
||||||
h_node_inputs.size() * sizeof(EvaluateSplitInputs),
|
d_node_inputs.data().get(), h_node_inputs.data(),
|
||||||
cudaMemcpyDefault));
|
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
|
||||||
|
|
||||||
this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
|
this->evaluator_.EvaluateSplits(nidx, max_active_features,
|
||||||
shared_inputs, dh::ToSpan(entries));
|
dh::ToSpan(d_node_inputs), shared_inputs,
|
||||||
|
dh::ToSpan(entries));
|
||||||
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
|
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
|
||||||
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
||||||
cudaMemcpyDeviceToHost));
|
cudaMemcpyDeviceToHost));
|
||||||
|
|||||||
@ -1,6 +1,9 @@
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import xgboost as xgb
|
||||||
|
|
||||||
sys.path.append("tests/python")
|
sys.path.append("tests/python")
|
||||||
# Don't import the test class, otherwise they will run twice.
|
# Don't import the test class, otherwise they will run twice.
|
||||||
@ -13,7 +16,34 @@ class TestGPUInteractionConstraints:
|
|||||||
cputest = test_ic.TestInteractionConstraints()
|
cputest = test_ic.TestInteractionConstraints()
|
||||||
|
|
||||||
def test_interaction_constraints(self):
|
def test_interaction_constraints(self):
|
||||||
self.cputest.run_interaction_constraints(tree_method='gpu_hist')
|
self.cputest.run_interaction_constraints(tree_method="gpu_hist")
|
||||||
|
|
||||||
def test_training_accuracy(self):
|
def test_training_accuracy(self):
|
||||||
self.cputest.training_accuracy(tree_method='gpu_hist')
|
self.cputest.training_accuracy(tree_method="gpu_hist")
|
||||||
|
|
||||||
|
# case where different number of features can occur in the evaluator
|
||||||
|
def test_issue_8730(self):
|
||||||
|
X = pd.DataFrame(
|
||||||
|
zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)),
|
||||||
|
columns=["A", "B", "C", "D"],
|
||||||
|
)
|
||||||
|
y = np.array([*([0] * 50), *([1] * 50)])
|
||||||
|
dm = xgb.DMatrix(X, label=y)
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"eta": 0.16095019509249486,
|
||||||
|
"min_child_weight": 1,
|
||||||
|
"subsample": 0.688567929338029,
|
||||||
|
"colsample_bynode": 0.7,
|
||||||
|
"gamma": 5.666579817418348e-06,
|
||||||
|
"lambda": 0.14943712232059794,
|
||||||
|
"grow_policy": "depthwise",
|
||||||
|
"max_depth": 3,
|
||||||
|
"tree_method": "gpu_hist",
|
||||||
|
"interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]],
|
||||||
|
"objective": "count:poisson",
|
||||||
|
"eval_metric": "poisson-nloglik",
|
||||||
|
"verbosity": 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
xgb.train(params, dm, num_boost_round=100)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user