Add categorical data support to GPU Hist. (#6164)

This commit is contained in:
Jiaming Yuan
2020-09-29 11:27:25 +08:00
committed by GitHub
parent 798af22ff4
commit 444131a2e6
9 changed files with 306 additions and 103 deletions

View File

@@ -15,7 +15,7 @@ auto ZeroParam() {
}
} // anonymous namespace
TEST(GpuHist, EvaluateSingleSplit) {
void TestEvaluateSingleSplit(bool is_categorical) {
thrust::device_vector<DeviceSplitCandidate> out_splits(1);
GradientPair parent_sum(0.0, 1.0);
TrainParam tparam = ZeroParam();
@@ -33,11 +33,19 @@ TEST(GpuHist, EvaluateSingleSplit) {
thrust::device_vector<GradientPair> feature_histogram =
std::vector<GradientPair>{
{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}};
thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
dh::device_vector<FeatureType> feature_types(feature_set.size(),
FeatureType::kCategorical);
common::Span<FeatureType> d_feature_types;
if (is_categorical) {
d_feature_types = dh::ToSpan(feature_types);
}
EvaluateSplitInputs<GradientPair> input{1,
parent_sum,
param,
dh::ToSpan(feature_set),
d_feature_types,
dh::ToSpan(feature_segments),
dh::ToSpan(feature_values),
dh::ToSpan(feature_min_values),
@@ -55,6 +63,14 @@ TEST(GpuHist, EvaluateSingleSplit) {
parent_sum.GetHess());
}
TEST(GpuHist, EvaluateSingleSplit) {
TestEvaluateSingleSplit(false);
}
TEST(GpuHist, EvaluateCategoricalSplit) {
TestEvaluateSingleSplit(true);
}
TEST(GpuHist, EvaluateSingleSplitMissing) {
thrust::device_vector<DeviceSplitCandidate> out_splits(1);
GradientPair parent_sum(1.0, 1.5);
@@ -74,6 +90,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
parent_sum,
param,
dh::ToSpan(feature_set),
{},
dh::ToSpan(feature_segments),
dh::ToSpan(feature_values),
dh::ToSpan(feature_min_values),
@@ -134,6 +151,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
parent_sum,
param,
dh::ToSpan(feature_set),
{},
dh::ToSpan(feature_segments),
dh::ToSpan(feature_values),
dh::ToSpan(feature_min_values),
@@ -174,6 +192,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
parent_sum,
param,
dh::ToSpan(feature_set),
{},
dh::ToSpan(feature_segments),
dh::ToSpan(feature_values),
dh::ToSpan(feature_min_values),
@@ -215,6 +234,7 @@ TEST(GpuHist, EvaluateSplits) {
parent_sum,
param,
dh::ToSpan(feature_set),
{},
dh::ToSpan(feature_segments),
dh::ToSpan(feature_values),
dh::ToSpan(feature_min_values),
@@ -224,6 +244,7 @@ TEST(GpuHist, EvaluateSplits) {
parent_sum,
param,
dh::ToSpan(feature_set),
{},
dh::ToSpan(feature_segments),
dh::ToSpan(feature_values),
dh::ToSpan(feature_min_values),
@@ -241,6 +262,5 @@ TEST(GpuHist, EvaluateSplits) {
EXPECT_EQ(result_right.findex, 0);
EXPECT_EQ(result_right.fvalue, 1.0);
}
} // namespace tree
} // namespace xgboost

View File

@@ -80,7 +80,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
param.Init(args);
auto page = BuildEllpackPage(kNRows, kNCols);
BatchParam batch_param{};
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), kNRows, param, kNCols, kNCols,
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), {}, kNRows, param, kNCols, kNCols,
true, batch_param);
xgboost::SimpleLCG gen;
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
@@ -130,6 +130,48 @@ TEST(GpuHist, BuildHistSharedMem) {
TestBuildHist<GradientPair>(true);
}
TEST(GpuHist, ApplySplit) {
RegTree tree;
ExpandEntry candidate;
candidate.nid = 0;
candidate.left_weight = 1.0f;
candidate.right_weight = 2.0f;
candidate.base_weight = 3.0f;
candidate.split.is_cat = true;
candidate.split.fvalue = 1.0f; // at cat 1
size_t n_rows = 10;
size_t n_cols = 10;
auto m = RandomDataGenerator{n_rows, n_cols, 0}.GenerateDMatrix(true);
GenericParameter p;
p.InitAllowUnknown(Args{});
TrainParam tparam;
tparam.InitAllowUnknown(Args{});
BatchParam bparam;
bparam.gpu_id = 0;
bparam.max_bin = 3;
bparam.gpu_page_size = 0;
for (auto& ellpack : m->GetBatches<EllpackPage>(bparam)){
auto impl = ellpack.Impl();
HostDeviceVector<FeatureType> feature_types(10, FeatureType::kCategorical);
feature_types.SetDevice(bparam.gpu_id);
tree::GPUHistMakerDevice<GradientPairPrecise> updater(
0, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols, true, bparam);
updater.ApplySplit(candidate, &tree);
ASSERT_EQ(tree.GetSplitTypes().size(), 3);
ASSERT_EQ(tree.GetSplitTypes()[0], FeatureType::kCategorical);
ASSERT_EQ(tree.GetSplitCategories().size(), 1);
uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0
ASSERT_EQ(tree.GetSplitCategories().back(), bits);
ASSERT_EQ(updater.node_categories.size(), 1);
}
}
HistogramCutsWrapper GetHostCutMatrix () {
HistogramCutsWrapper cmat;
cmat.SetPtrs({0, 3, 6, 9, 12, 15, 18, 21, 24});
@@ -154,19 +196,18 @@ TEST(GpuHist, EvaluateRootSplit) {
TrainParam param;
std::vector<std::pair<std::string, std::string>> args {
{"max_depth", "1"},
{"max_leaves", "0"},
std::vector<std::pair<std::string, std::string>> args{
{"max_depth", "1"},
{"max_leaves", "0"},
// Disable all other parameters.
{"colsample_bynode", "1"},
{"colsample_bylevel", "1"},
{"colsample_bytree", "1"},
{"min_child_weight", "0.01"},
{"reg_alpha", "0"},
{"reg_lambda", "0"},
{"max_delta_step", "0"}
};
// Disable all other parameters.
{"colsample_bynode", "1"},
{"colsample_bylevel", "1"},
{"colsample_bytree", "1"},
{"min_child_weight", "0.01"},
{"reg_alpha", "0"},
{"reg_lambda", "0"},
{"max_delta_step", "0"}};
param.Init(args);
for (size_t i = 0; i < kNCols; ++i) {
param.monotone_constraints.emplace_back(0);
@@ -178,7 +219,7 @@ TEST(GpuHist, EvaluateRootSplit) {
auto page = BuildEllpackPage(kNRows, kNCols);
BatchParam batch_param{};
GPUHistMakerDevice<GradientPairPrecise>
maker(0, page.get(), kNRows, param, kNCols, kNCols, true, batch_param);
maker(0, page.get(), {}, kNRows, param, kNCols, kNCols, true, batch_param);
// Initialize GPUHistMakerDevice::node_sum_gradients
maker.node_sum_gradients = {};
@@ -257,7 +298,6 @@ void TestHistogramIndexImpl() {
ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());
}
TEST(GpuHist, TestHistogramIndex) {