Support categorical data for hist. (#7695)

* Extract partitioner from hist.
* Implement categorical data support by passing the gradient index directly into the partitioner.
* Organize/update document.
* Remove code for negative hessian.
This commit is contained in:
Jiaming Yuan
2022-02-25 03:47:14 +08:00
committed by GitHub
parent f60d95b0ba
commit 83a66b4994
15 changed files with 402 additions and 498 deletions

View File

@@ -40,7 +40,7 @@ template <typename GradientSumT> void TestEvaluateSplits() {
std::iota(row_indices.begin(), row_indices.end(), 0);
row_set_collection.Init();
auto hist_builder = GHistBuilder<GradientSumT>(gmat.cut.Ptrs().back());
auto hist_builder = common::GHistBuilder<GradientSumT>(gmat.cut.Ptrs().back());
hist.Init(gmat.cut.Ptrs().back());
hist.AddHistRow(0);
hist.AllocateAllData();
@@ -94,7 +94,7 @@ TEST(HistEvaluator, Apply) {
RegTree tree;
int static constexpr kNRows = 8, kNCols = 16;
TrainParam param;
param.UpdateAllowUnknown(Args{{}});
param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
auto sampler = std::make_shared<common::ColumnSampler>();
auto evaluator_ = HistEvaluator<float, CPUExpandEntry>{param, dmat->Info(), 4, sampler,
@@ -102,12 +102,22 @@ TEST(HistEvaluator, Apply) {
CPUExpandEntry entry{0, 0, 10.0f};
entry.split.left_sum = GradStats{0.4, 0.6f};
entry.split.right_sum = GradStats{0.5, 0.7f};
entry.split.right_sum = GradStats{0.5, 0.5f};
evaluator_.ApplyTreeSplit(entry, &tree);
ASSERT_EQ(tree.NumExtraNodes(), 2);
ASSERT_EQ(tree.Stat(tree[0].LeftChild()).sum_hess, 0.6f);
ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.7f);
ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.5f);
{
RegTree tree;
entry.split.is_cat = true;
entry.split.split_value = 1.0;
evaluator_.ApplyTreeSplit(entry, &tree);
auto l = entry.split.left_sum;
ASSERT_NEAR(tree[1].LeafValue(), -l.sum_grad / l.sum_hess * param.learning_rate, kRtEps);
ASSERT_NEAR(tree[2].LeafValue(), -param.learning_rate, kRtEps);
}
}
TEST_F(TestPartitionBasedSplit, CPUHist) {

View File

@@ -1,26 +1,14 @@
/*!
* Copyright 2021 XGBoost contributors
* Copyright 2021-2022, XGBoost contributors.
*/
#include <gtest/gtest.h>
#include "../../../src/tree/updater_approx.h"
#include "../helpers.h"
#include "test_partitioner.h"
namespace xgboost {
namespace tree {
namespace {
// Test helper: expand the root of `tree` with a numerical split on feature 0
// at `split_value`, then mirror the same split into the first queued
// candidate entry. All statistics passed to ExpandNode (gain, weights, sums)
// are zero — the partitioner tests only exercise the split geometry.
void GetSplit(RegTree *tree, float split_value, std::vector<CPUExpandEntry> *candidates) {
tree->ExpandNode(
/*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
/*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
/*left_sum=*/0.0f,
/*right_sum=*/0.0f);
// Assumes `candidates` is non-empty; the caller provides the root candidate.
candidates->front().split.split_value = split_value;
// sindex carries the split feature id (0); bit 31 presumably encodes the
// default direction, consistent with default_left=true above — TODO confirm
// against SplitEntry::sindex.
candidates->front().split.sindex = 0;
candidates->front().split.sindex |= (1U << 31);
}
} // anonymous namespace
TEST(Approx, Partitioner) {
size_t n_samples = 1024, n_features = 1, base_rowid = 0;
ApproxRowPartitioner partitioner{n_samples, base_rowid};

View File

@@ -0,0 +1,21 @@
/*!
* Copyright 2021-2022, XGBoost contributors.
*/
#include <xgboost/tree_model.h>
#include <vector>
#include "../../../src/tree/hist/expand_entry.h"
namespace xgboost {
namespace tree {
/*!
 * \brief Test helper: expand the root of \p tree with a numerical split on
 *        feature 0 at \p split_value, and record the matching split in the
 *        first candidate entry.  Every statistic handed to ExpandNode is
 *        zero — only the split geometry matters to the partitioner tests.
 *        Assumes \p candidates is non-empty.
 */
inline void GetSplit(RegTree *tree, float split_value, std::vector<CPUExpandEntry> *candidates) {
  tree->ExpandNode(/*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
                   /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                   /*left_sum=*/0.0f, /*right_sum=*/0.0f);
  auto &split = candidates->front().split;
  split.split_value = split_value;
  // Feature id 0 in the low bits; bit 31 presumably flags the default
  // direction (matches default_left=true above) — TODO confirm against
  // SplitEntry::sindex.
  split.sindex = 0U | (1U << 31);
}
} // namespace tree
} // namespace xgboost

View File

@@ -1,18 +1,19 @@
/*!
* Copyright 2018-2022 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <vector>
#include <string>
#include <vector>
#include "../helpers.h"
#include "../../../src/tree/param.h"
#include "../../../src/tree/updater_quantile_hist.h"
#include "../../../src/tree/split_evaluator.h"
#include "../../../src/tree/updater_quantile_hist.h"
#include "../helpers.h"
#include "test_partitioner.h"
#include "xgboost/data.h"
namespace xgboost {
@@ -94,130 +95,6 @@ class QuantileHistMock : public QuantileHistMaker {
}
}
}
// Verifies that row subsampling done inside InitData() is deterministic with
// respect to the number of OpenMP threads: the row set produced with the
// current thread count must be reproduced exactly when re-running with 1..3
// threads and a restored global RNG state.  Also sanity-checks
// RandomReplace::SimpleSkip against stepping an LCG by hand.
void TestInitDataSampling(const GHistIndexMatrix& gmat,
std::vector<GradientPair>* gpair,
DMatrix* p_fmat,
const RegTree& tree) {
// check SimpleSkip: advancing the engine 100 steps one-by-one must agree
// with SimpleSkip jumping 100 steps analytically from the same seed.
size_t initial_seed = 777;
std::linear_congruential_engine<std::uint_fast64_t, 16807, 0,
static_cast<uint64_t>(1) << 63 > eng_first(initial_seed);
for (size_t i = 0; i < 100; ++i) {
eng_first();
}
uint64_t initial_seed_th = RandomReplace::SimpleSkip(100, initial_seed, 16807, RandomReplace::kMod);
std::linear_congruential_engine<std::uint_fast64_t, RandomReplace::kBase, 0,
RandomReplace::kMod > eng_second(initial_seed_th);
ASSERT_EQ(eng_first(), eng_second());
// NOTE(review): omp_get_num_threads() returns 1 outside a parallel region,
// so the "restore" at the end likely sets 1 rather than the original
// maximum; omp_get_max_threads() was probably intended — verify.
const size_t nthreads = omp_get_num_threads();
// save state of global rng engine so every run below starts identically
auto initial_rnd = common::GlobalRandom();
std::vector<size_t> unused_rows_cpy = this->unused_rows_;
// Baseline run with the current thread configuration.
RealImpl::InitData(gmat, *p_fmat, tree, gpair);
std::vector<size_t> row_indices_initial = *(this->row_set_collection_.Data());
std::vector<size_t> unused_row_indices_initial = this->unused_rows_;
ASSERT_EQ(row_indices_initial.size(), p_fmat->Info().num_row_);
// NOTE(review): despite the name, this lambda only asserts that `first`
// holds all rows and `second` is empty — i.e. no row ended up unused.
auto check_each_row_occurs_in_one_of_arrays = [](const std::vector<size_t>& first,
const std::vector<size_t>& second,
size_t nrows) {
ASSERT_EQ(first.size(), nrows);
ASSERT_EQ(second.size(), 0);
};
check_each_row_occurs_in_one_of_arrays(row_indices_initial, unused_row_indices_initial,
p_fmat->Info().num_row_);
// Re-run with 1..3 threads; each run must reproduce the baseline exactly.
for (size_t i_nthreads = 1; i_nthreads < 4; ++i_nthreads) {
omp_set_num_threads(i_nthreads);
// return initial state of global rng engine
common::GlobalRandom() = initial_rnd;
this->unused_rows_ = unused_rows_cpy;
RealImpl::InitData(gmat, *p_fmat, tree, gpair);
std::vector<size_t>& row_indices = *(this->row_set_collection_.Data());
ASSERT_EQ(row_indices_initial.size(), row_indices.size());
for (size_t i = 0; i < row_indices_initial.size(); ++i) {
ASSERT_EQ(row_indices_initial[i], row_indices[i]);
}
std::vector<size_t>& unused_row_indices = this->unused_rows_;
ASSERT_EQ(unused_row_indices_initial.size(), unused_row_indices.size());
for (size_t i = 0; i < unused_row_indices_initial.size(); ++i) {
ASSERT_EQ(unused_row_indices_initial[i], unused_row_indices[i]);
}
check_each_row_occurs_in_one_of_arrays(row_indices, unused_row_indices,
p_fmat->Info().num_row_);
}
// restore the thread count captured on entry (see NOTE above)
omp_set_num_threads(nthreads);
}
// Exercises partition_builder_: for feature 0, tries a split at each of the
// four bins and checks the builder's left/right row counts against counts
// computed by hand directly from the gradient index, across several sparsity
// levels (so the missing-value code path is covered too).
void TestApplySplit(const RegTree& tree) {
std::vector<GradientPair> row_gpairs =
{ {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
{0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} };
int32_t constexpr kMaxBins = 4;
// try out different sparsity to get different number of missing values
for (double sparsity : {0.0, 0.1, 0.2}) {
// kNRows samples with kNCols features
auto dmat = RandomDataGenerator(kNRows, kNCols, sparsity).Seed(3).GenerateDMatrix();
float sparse_th = 0.0;
GHistIndexMatrix gmat{dmat.get(), kMaxBins, sparse_th, false, common::OmpGetNumThreads(0)};
ColumnMatrix cm;
// treat everything as dense, as this is what we intend to test here
cm.Init(gmat, sparse_th, common::OmpGetNumThreads(0));
RealImpl::InitData(gmat, *dmat, tree, &row_gpairs);
const size_t num_row = dmat->Info().num_row_;
// split by feature 0: its bins occupy [Ptrs()[0], Ptrs()[1]) in the cuts
const size_t bin_id_min = gmat.cut.Ptrs()[0];
const size_t bin_id_max = gmat.cut.Ptrs()[1];
// attempt to split at different bins
for (size_t split = 0; split < 4; split++) {
size_t left_cnt = 0, right_cnt = 0;
// manually compute how many samples go left or right: a row goes left
// when its feature-0 bin id is <= the split bin
for (size_t rid = 0; rid < num_row; ++rid) {
for (size_t offset = gmat.row_ptr[rid]; offset < gmat.row_ptr[rid + 1]; ++offset) {
const size_t bin_id = gmat.index[offset];
if (bin_id >= bin_id_min && bin_id < bin_id_max) {
if (bin_id <= split) {
left_cnt++;
} else {
right_cnt++;
}
}
}
}
// if any were missing due to sparsity, we add them to the left or to the right
size_t missing = kNRows - left_cnt - right_cnt;
if (tree[0].DefaultLeft()) {
left_cnt += missing;
} else {
right_cnt += missing;
}
// have one node with kNRows (=8 at the moment) rows, just one task
RealImpl::partition_builder_.Init(1, 1, [&](size_t node_in_set) {
return 1;
});
const size_t task_id = RealImpl::partition_builder_.GetTaskIdx(0, 0);
RealImpl::partition_builder_.AllocateForTask(task_id);
// Template args: uint8_t bin storage (kMaxBins = 4 fits in a byte) and a
// compile-time flag for whether missing values are present.
if (cm.AnyMissing()) {
RealImpl::partition_builder_.template Partition<uint8_t, true>(0, 0, common::Range1d(0, kNRows),
split, cm, tree, this->row_set_collection_[0].begin);
} else {
RealImpl::partition_builder_.template Partition<uint8_t, false>(0, 0, common::Range1d(0, kNRows),
split, cm, tree, this->row_set_collection_[0].begin);
}
RealImpl::partition_builder_.CalculateRowOffsets();
// The builder's counts must match the manual computation above.
ASSERT_EQ(RealImpl::partition_builder_.GetNLeftElems(0), left_cnt);
ASSERT_EQ(RealImpl::partition_builder_.GetNRightElems(0), right_cnt);
}
}
}
};
int static constexpr kNRows = 8, kNCols = 16;
@@ -262,33 +139,6 @@ class QuantileHistMock : public QuantileHistMaker {
float_builder_->TestInitData(gmat, &gpair, dmat_.get(), tree);
}
}
// Forwards the sampling-determinism check to whichever builder precision is
// active, feeding it a fresh 4-bin quantile index over the stored DMatrix, a
// tree configured from cfg_, and two blocks of four identical gradient pairs.
void TestInitDataSampling() {
  constexpr int32_t kMaxBins = 4;
  RegTree tree;
  tree.param.UpdateAllowUnknown(cfg_);
  GHistIndexMatrix gmat{dmat_.get(), kMaxBins, 0.0f, false, common::OmpGetNumThreads(0)};
  std::vector<GradientPair> gpair{
      {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
      {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}};
  if (double_builder_) {
    double_builder_->TestInitDataSampling(gmat, &gpair, dmat_.get(), tree);
  } else {
    float_builder_->TestInitDataSampling(gmat, &gpair, dmat_.get(), tree);
  }
}
// Dispatches the ApplySplit check to the active builder, with a tree
// configured from cfg_.
void TestApplySplit() {
  RegTree tree;
  tree.param.UpdateAllowUnknown(cfg_);
  if (!double_builder_) {
    float_builder_->TestApplySplit(tree);
  } else {
    double_builder_->TestApplySplit(tree);
  }
}
};
TEST(QuantileHist, InitData) {
@@ -301,30 +151,62 @@ TEST(QuantileHist, InitData) {
maker_float.TestInitData();
}
TEST(QuantileHist, InitDataSampling) {
  // Run the sampling determinism check under both histogram precisions.
  const float subsample = 0.5;
  std::vector<std::pair<std::string, std::string>> cfg{
      {"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
      {"subsample", std::to_string(subsample)}};
  QuantileHistMock maker_double(cfg);
  maker_double.TestInitDataSampling();
  QuantileHistMock maker_float(cfg, /*single_precision_histogram=*/true);
  maker_float.TestInitDataSampling();
}
TEST(QuantileHist, Partitioner) {
size_t n_samples = 1024, n_features = 1, base_rowid = 0;
GenericParameter ctx;
ctx.InitAllowUnknown(Args{});
TEST(QuantileHist, ApplySplit) {
  // Elastic-net evaluator with every regularization knob set to zero; run
  // the split-application check under both histogram precisions.
  std::vector<std::pair<std::string, std::string>> cfg{
      {"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
      {"split_evaluator", "elastic_net"},
      {"reg_lambda", "0"},
      {"reg_alpha", "0"},
      {"max_delta_step", "0"},
      {"min_child_weight", "0"}};
  QuantileHistMock maker_double(cfg);
  maker_double.TestApplySplit();
  QuantileHistMock maker_float(cfg, /*single_precision_histogram=*/true);
  maker_float.TestApplySplit();
}
HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()};
ASSERT_EQ(partitioner.base_rowid, base_rowid);
ASSERT_EQ(partitioner.Size(), 1);
ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
auto grad = GenerateRandomGradients(n_samples);
std::vector<float> hess(grad.Size());
std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(),
[](auto gpair) { return gpair.GetHess(); });
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, 0.5})) {
bst_feature_t const split_ind = 0;
common::ColumnMatrix column_indices;
column_indices.Init(page, 0.5, ctx.Threads());
{
auto min_value = page.cut.MinValues()[split_ind];
RegTree tree;
HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()};
GetSplit(&tree, min_value, &candidates);
partitioner.UpdatePosition<false, true>(&ctx, page, column_indices, candidates, &tree);
ASSERT_EQ(partitioner.Size(), 3);
ASSERT_EQ(partitioner[1].Size(), 0);
ASSERT_EQ(partitioner[2].Size(), n_samples);
}
{
HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()};
auto ptr = page.cut.Ptrs()[split_ind + 1];
float split_value = page.cut.Values().at(ptr / 2);
RegTree tree;
GetSplit(&tree, split_value, &candidates);
auto left_nidx = tree[RegTree::kRoot].LeftChild();
partitioner.UpdatePosition<false, true>(&ctx, page, column_indices, candidates, &tree);
auto elem = partitioner[left_nidx];
ASSERT_LT(elem.Size(), n_samples);
ASSERT_GT(elem.Size(), 1);
for (auto it = elem.begin; it != elem.end; ++it) {
auto value = page.cut.Values().at(page.index[*it]);
ASSERT_LE(value, split_value);
}
auto right_nidx = tree[RegTree::kRoot].RightChild();
elem = partitioner[right_nidx];
for (auto it = elem.begin; it != elem.end; ++it) {
auto value = page.cut.Values().at(page.index[*it]);
ASSERT_GT(value, split_value) << *it;
}
}
}
}
} // namespace tree
} // namespace xgboost