Support column split in multi-target hist (#9171)

This commit is contained in:
Rong Ou
2023-05-26 01:56:05 -07:00
committed by GitHub
parent acd363033e
commit 5b69534b43
17 changed files with 386 additions and 96 deletions

View File

@@ -33,7 +33,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();
auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
common::HistCollection hist;
std::vector<GradientPair> row_gpairs = {
{1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
@@ -167,7 +167,7 @@ TEST(HistEvaluator, Apply) {
param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
auto sampler = std::make_shared<common::ColumnSampler>();
auto evaluator_ = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
auto evaluator_ = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
CPUExpandEntry entry{0, 0};
entry.split.loss_chg = 10.0f;
@@ -195,7 +195,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
// check the evaluator is returning the optimal split
std::vector<FeatureType> ft{FeatureType::kCategorical};
auto sampler = std::make_shared<common::ColumnSampler>();
HistEvaluator<CPUExpandEntry> evaluator{&ctx, &param_, info_, sampler};
HistEvaluator evaluator{&ctx, &param_, info_, sampler};
evaluator.InitRoot(GradStats{total_gpair_});
RegTree tree;
std::vector<CPUExpandEntry> entries(1);
@@ -225,7 +225,7 @@ auto CompareOneHotAndPartition(bool onehot) {
RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();
auto sampler = std::make_shared<common::ColumnSampler>();
auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
std::vector<CPUExpandEntry> entries(1);
for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>(&ctx, {32, param.sparse_threshold})) {
@@ -276,7 +276,7 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
info.num_col_ = 1;
info.feature_types = {FeatureType::kCategorical};
Context ctx;
auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param_, info, sampler};
auto evaluator = HistEvaluator{&ctx, &param_, info, sampler};
evaluator.InitRoot(GradStats{parent_sum_});
std::vector<CPUExpandEntry> entries(1);

View File

@@ -79,7 +79,7 @@ TEST(CPUMonoConstraint, Basic) {
auto Xy = RandomDataGenerator{kRows, kCols, 0.0}.GenerateDMatrix(true);
auto sampler = std::make_shared<common::ColumnSampler>();
HistEvaluator<CPUExpandEntry> evalutor{&ctx, &param, Xy->Info(), sampler};
HistEvaluator evalutor{&ctx, &param, Xy->Info(), sampler};
evalutor.InitRoot(GradStats{2.0, 2.0});
SplitEntry split;

View File

@@ -9,28 +9,20 @@
#include "../helpers.h"
namespace xgboost::tree {
std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols){
return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
}
std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(std::size_t rows) {
auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows);
auto& h_gradients = p_gradients->HostVector();
xgboost::SimpleLCG gen;
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
for (std::size_t i = 0; i < rows; ++i) {
auto grad = dist(&gen);
auto hess = dist(&gen);
h_gradients[i] = GradientPair{grad, hess};
std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols,
bool categorical = false) {
if (categorical) {
std::vector<FeatureType> ft(cols);
for (size_t i = 0; i < ft.size(); ++i) {
ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
}
return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix();
} else {
return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
}
return p_gradients;
}
TEST(GrowHistMaker, InteractionConstraint)
{
TEST(GrowHistMaker, InteractionConstraint) {
auto constexpr kRows = 32;
auto constexpr kCols = 16;
auto p_dmat = GenerateDMatrix(kRows, kCols);
@@ -74,8 +66,9 @@ TEST(GrowHistMaker, InteractionConstraint)
}
namespace {
void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_tree) {
auto p_dmat = GenerateDMatrix(rows, cols);
void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
RegTree const& expected_tree) {
auto p_dmat = GenerateDMatrix(rows, cols, categorical);
auto p_gradients = GenerateGradients(rows);
Context ctx;
ObjInfo task{ObjInfo::kRegression};
@@ -90,27 +83,21 @@ void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_t
param.Init(Args{});
updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
ASSERT_EQ(tree.NumExtraNodes(), 10);
ASSERT_EQ(tree[0].SplitIndex(), 1);
ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
FeatureMap fmap;
auto json = tree.DumpModel(fmap, false, "json");
auto expected_json = expected_tree.DumpModel(fmap, false, "json");
Json json{Object{}};
tree.SaveModel(&json);
Json expected_json{Object{}};
expected_tree.SaveModel(&expected_json);
ASSERT_EQ(json, expected_json);
}
} // anonymous namespace
TEST(GrowHistMaker, ColumnSplit) {
void TestColumnSplit(bool categorical) {
auto constexpr kRows = 32;
auto constexpr kCols = 16;
RegTree expected_tree{1u, kCols};
ObjInfo task{ObjInfo::kRegression};
{
auto p_dmat = GenerateDMatrix(kRows, kCols);
auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
auto p_gradients = GenerateGradients(kRows);
Context ctx;
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
@@ -121,6 +108,12 @@ TEST(GrowHistMaker, ColumnSplit) {
}
auto constexpr kWorldSize = 2;
RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit, kRows, kCols, std::cref(expected_tree));
RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, categorical,
std::cref(expected_tree));
}
} // anonymous namespace
// Column-split check with the `categorical` flag switched off.
TEST(GrowHistMaker, ColumnSplitNumerical) {
  TestColumnSplit(false);
}
// Column-split check with the `categorical` flag switched on.
TEST(GrowHistMaker, ColumnSplitCategorical) {
  TestColumnSplit(true);
}
} // namespace xgboost::tree

View File

@@ -194,11 +194,65 @@ void TestColumnSplitPartitioner(bst_target_t n_targets) {
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner);
n_samples, n_features, base_rowid, Xy, min_value, mid_value,
mid_partitioner);
}
} // anonymous namespace
// Column-split partitioner check using CPUExpandEntry with n_targets = 1.
TEST(QuantileHist, PartitionerColSplit) {
  TestColumnSplitPartitioner<CPUExpandEntry>(1);
}
// Column-split partitioner check using MultiExpandEntry with n_targets = 3.
TEST(QuantileHist, MultiPartitionerColSplit) {
  TestColumnSplitPartitioner<MultiExpandEntry>(3);
}
namespace {
// Builds a tree with "grow_quantile_histmaker" on this worker's column slice of freshly
// generated data and asserts that its saved JSON model equals that of `expected_tree`.
//
// rows / cols    Shape of the generated data.
// n_targets      Number of targets for the gradients and for the tree being grown.
// expected_tree  Reference tree whose saved model the result must equal.
void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_targets,
                       RegTree const& expected_tree) {
  auto p_fmat = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
  auto p_gpair = GenerateGradients(rows, n_targets);

  Context context;
  ObjInfo obj_task{ObjInfo::kRegression};
  std::unique_ptr<TreeUpdater> p_updater{
      TreeUpdater::Create("grow_quantile_histmaker", &context, &obj_task)};
  std::vector<HostDeviceVector<bst_node_t>> node_position(1);

  // Keep only the columns assigned to this worker.
  auto const world_size = collective::GetWorldSize();
  auto const rank = collective::GetRank();
  std::unique_ptr<DMatrix> p_col_slice{p_fmat->SliceCol(world_size, rank)};

  RegTree actual_tree{n_targets, cols};
  TrainParam train_param;
  train_param.Init(Args{});
  p_updater->Update(&train_param, p_gpair.get(), p_col_slice.get(), node_position,
                    {&actual_tree});

  // Compare the two trees through their saved JSON models.
  Json actual_model{Object{}};
  actual_tree.SaveModel(&actual_model);
  Json expected_model{Object{}};
  expected_tree.SaveModel(&expected_model);
  ASSERT_EQ(actual_model, expected_model);
}
// Grows a reference tree on the full (unsliced) generated data, then hands it to
// VerifyColumnSplit through RunWithInMemoryCommunicator with a world size of 2.
//
// n_targets  Number of targets for the gradients and for the trees.
void TestColumnSplit(bst_target_t n_targets) {
  auto constexpr kRows = 32;
  auto constexpr kCols = 16;
  auto constexpr kWorldSize = 2;

  RegTree expected_tree{n_targets, kCols};
  ObjInfo task{ObjInfo::kRegression};
  {
    // Reference run: the updater sees every column of the data.
    auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
    auto p_gpair = GenerateGradients(kRows, n_targets);
    Context context;
    std::unique_ptr<TreeUpdater> p_updater{
        TreeUpdater::Create("grow_quantile_histmaker", &context, &task)};
    std::vector<HostDeviceVector<bst_node_t>> node_position(1);
    TrainParam train_param;
    train_param.Init(Args{});
    p_updater->Update(&train_param, p_gpair.get(), p_fmat.get(), node_position,
                      {&expected_tree});
  }

  RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, n_targets,
                              std::cref(expected_tree));
}
} // anonymous namespace
// Column-split check with n_targets = 1.
TEST(QuantileHist, ColumnSplit) {
  TestColumnSplit(1);
}
// Column-split check with n_targets = 3.
TEST(QuantileHist, ColumnSplitMultiTarget) {
  TestColumnSplit(3);
}
} // namespace xgboost::tree