Use Booster context in DMatrix. (#8896)

- Pass context from booster to DMatrix.
- Use context instead of integer for `n_threads`.
- Check the consistency of the `max_bin` configuration.
- Test for all combinations of initialization options.
This commit is contained in:
Jiaming Yuan
2023-04-28 21:47:14 +08:00
committed by GitHub
parent 1f9a57d17b
commit 08ce495b5d
67 changed files with 1283 additions and 935 deletions

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018 by Contributors
/**
* Copyright 2018-2023 by XGBoost Contributors
* \author Rory Mitchell
*/
#pragma once
@@ -78,11 +78,12 @@ inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradient(int group_idx, int num_group, int fidx,
const std::vector<GradientPair> &gpair,
inline std::pair<double, double> GetGradient(Context const *ctx, int group_idx, int num_group,
bst_feature_t fidx,
std::vector<GradientPair> const &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
auto page = batch.GetView();
auto col = page[fidx];
const auto ndata = static_cast<bst_omp_uint>(col.size());
@@ -115,7 +116,7 @@ inline std::pair<double, double> GetGradientParallel(Context const *ctx, int gro
std::vector<double> sum_grad_tloc(ctx->Threads(), 0.0);
std::vector<double> sum_hess_tloc(ctx->Threads(), 0.0);
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
auto page = batch.GetView();
auto col = page[fidx];
const auto ndata = static_cast<bst_omp_uint>(col.size());
@@ -177,16 +178,16 @@ inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
float dw, std::vector<GradientPair> *in_gpair,
DMatrix *p_fmat, int32_t n_threads) {
inline void UpdateResidualParallel(Context const *ctx, bst_feature_t fidx, int group_idx,
int num_group, float dw, std::vector<GradientPair> *in_gpair,
DMatrix *p_fmat) {
if (dw == 0.0f) return;
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
auto page = batch.GetView();
auto col = page[fidx];
// update grad value
const auto num_row = static_cast<bst_omp_uint>(col.size());
common::ParallelFor(num_row, n_threads, [&](auto j) {
common::ParallelFor(num_row, ctx->Threads(), [&](auto j) {
GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
if (p.GetHess() < 0.0f) return;
p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
@@ -203,12 +204,12 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias,
std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
int32_t n_threads) {
inline void UpdateBiasResidualParallel(Context const *ctx, int group_idx, int num_group,
float dbias, std::vector<GradientPair> *in_gpair,
DMatrix *p_fmat) {
if (dbias == 0.0f) return;
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
common::ParallelFor(ndata, n_threads, [&](auto i) {
common::ParallelFor(ndata, ctx->Threads(), [&](auto i) {
GradientPair &g = (*in_gpair)[i * num_group + group_idx];
if (g.GetHess() < 0.0f) return;
g += GradientPair(g.GetHess() * dbias, 0);
@@ -220,18 +221,16 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias
* in coordinate descent algorithms.
*/
class FeatureSelector {
protected:
int32_t n_threads_{-1};
public:
explicit FeatureSelector(int32_t n_threads) : n_threads_{n_threads} {}
FeatureSelector() = default;
/*! \brief factory method */
static FeatureSelector *Create(int choice, int32_t n_threads);
static FeatureSelector *Create(int choice);
/*! \brief virtual destructor */
virtual ~FeatureSelector() = default;
/**
* \brief Setting up the selector state prior to looping through features.
*
* \param ctx The booster context.
* \param model The model.
* \param gpair The gpair.
* \param p_fmat The feature matrix.
@@ -239,13 +238,12 @@ class FeatureSelector {
* \param lambda Regularisation lambda.
* \param param A parameter with algorithm-dependent use.
*/
virtual void Setup(const gbm::GBLinearModel &,
const std::vector<GradientPair> &,
DMatrix *,
float , float , int ) {}
virtual void Setup(Context const *, const gbm::GBLinearModel &,
const std::vector<GradientPair> &, DMatrix *, float, float, int) {}
/**
* \brief Select next coordinate to update.
*
* \param ctx Booster context
* \param iteration The iteration in a loop through features
* \param model The model.
* \param group_idx Zero-based index of the group.
@@ -256,11 +254,9 @@ class FeatureSelector {
*
* \return The index of the selected feature. -1 indicates none selected.
*/
virtual int NextFeature(int iteration,
const gbm::GBLinearModel &model,
int group_idx,
const std::vector<GradientPair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) = 0;
virtual int NextFeature(Context const *ctx, int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<GradientPair> &gpair, DMatrix *p_fmat,
float alpha, float lambda) = 0;
};
/**
@@ -269,9 +265,8 @@ class FeatureSelector {
class CyclicFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int , const std::vector<GradientPair> &,
DMatrix *, float, float) override {
int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
const std::vector<GradientPair> &, DMatrix *, float, float) override {
return iteration % model.learner_model_param->num_feature;
}
};
@@ -283,8 +278,7 @@ class CyclicFeatureSelector : public FeatureSelector {
class ShuffleFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
void Setup(const gbm::GBLinearModel &model,
const std::vector<GradientPair>&,
void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
DMatrix *, float, float, int) override {
if (feat_index_.size() == 0) {
feat_index_.resize(model.learner_model_param->num_feature);
@@ -293,9 +287,8 @@ class ShuffleFeatureSelector : public FeatureSelector {
std::shuffle(feat_index_.begin(), feat_index_.end(), common::GlobalRandom());
}
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int, const std::vector<GradientPair> &,
DMatrix *, float, float) override {
int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
const std::vector<GradientPair> &, DMatrix *, float, float) override {
return feat_index_[iteration % model.learner_model_param->num_feature];
}
@@ -310,9 +303,8 @@ class ShuffleFeatureSelector : public FeatureSelector {
class RandomFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
int NextFeature(int, const gbm::GBLinearModel &model,
int, const std::vector<GradientPair> &,
DMatrix *, float, float) override {
int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int,
const std::vector<GradientPair> &, DMatrix *, float, float) override {
return common::GlobalRandom()() % model.learner_model_param->num_feature;
}
};
@@ -329,8 +321,7 @@ class RandomFeatureSelector : public FeatureSelector {
class GreedyFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
void Setup(const gbm::GBLinearModel &model,
const std::vector<GradientPair> &,
void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
DMatrix *, float, float, int param) override {
top_k_ = static_cast<bst_uint>(param);
const bst_uint ngroup = model.learner_model_param->num_output_group;
@@ -344,7 +335,7 @@ class GreedyFeatureSelector : public FeatureSelector {
}
}
int NextFeature(int, const gbm::GBLinearModel &model,
int NextFeature(Context const* ctx, int, const gbm::GBLinearModel &model,
int group_idx, const std::vector<GradientPair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) override {
// k-th selected feature for a group
@@ -356,9 +347,9 @@ class GreedyFeatureSelector : public FeatureSelector {
const bst_omp_uint nfeat = model.learner_model_param->num_feature;
// Calculate univariate gradient sums
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
auto page = batch.GetView();
common::ParallelFor(nfeat, this->n_threads_, [&](bst_omp_uint i) {
common::ParallelFor(nfeat, ctx->Threads(), [&](bst_omp_uint i) {
const auto col = page[i];
const bst_uint ndata = col.size();
auto &sums = gpair_sums_[group_idx * nfeat + i];
@@ -406,9 +397,10 @@ class GreedyFeatureSelector : public FeatureSelector {
class ThriftyFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
void Setup(const gbm::GBLinearModel &model,
const std::vector<GradientPair> &gpair,
DMatrix *p_fmat, float alpha, float lambda, int param) override {
void Setup(Context const *ctx, const gbm::GBLinearModel &model,
const std::vector<GradientPair> &gpair, DMatrix *p_fmat, float alpha, float lambda,
int param) override {
top_k_ = static_cast<bst_uint>(param);
if (param <= 0) top_k_ = std::numeric_limits<bst_uint>::max();
const bst_uint ngroup = model.learner_model_param->num_output_group;
@@ -422,10 +414,10 @@ class ThriftyFeatureSelector : public FeatureSelector {
}
// Calculate univariate gradient sums
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
auto page = batch.GetView();
// column-parallel is usually faster than row-parallel
common::ParallelFor(nfeat, this->n_threads_, [&](auto i) {
common::ParallelFor(nfeat, ctx->Threads(), [&](auto i) {
const auto col = page[i];
const bst_uint ndata = col.size();
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
@@ -462,9 +454,8 @@ class ThriftyFeatureSelector : public FeatureSelector {
}
}
int NextFeature(int, const gbm::GBLinearModel &model,
int group_idx, const std::vector<GradientPair> &,
DMatrix *, float, float) override {
int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int group_idx,
const std::vector<GradientPair> &, DMatrix *, float, float) override {
// k-th selected feature for a group
auto k = counter_[group_idx]++;
// stop after either reaching top-N or going through all the features in a group
@@ -482,18 +473,18 @@ class ThriftyFeatureSelector : public FeatureSelector {
std::vector<std::pair<double, double>> gpair_sums_;
};
inline FeatureSelector *FeatureSelector::Create(int choice, int32_t n_threads) {
inline FeatureSelector *FeatureSelector::Create(int choice) {
switch (choice) {
case kCyclic:
return new CyclicFeatureSelector(n_threads);
return new CyclicFeatureSelector;
case kShuffle:
return new ShuffleFeatureSelector(n_threads);
return new ShuffleFeatureSelector;
case kThrifty:
return new ThriftyFeatureSelector(n_threads);
return new ThriftyFeatureSelector;
case kGreedy:
return new GreedyFeatureSelector(n_threads);
return new GreedyFeatureSelector;
case kRandom:
return new RandomFeatureSelector(n_threads);
return new RandomFeatureSelector;
default:
LOG(FATAL) << "unknown coordinate selector: " << choice;
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018 by Contributors
/**
* Copyright 2018-2023 by XGBoost Contributors
* \author Rory Mitchell
*/
@@ -30,7 +30,7 @@ class CoordinateUpdater : public LinearUpdater {
tparam_.UpdateAllowUnknown(args)
};
cparam_.UpdateAllowUnknown(rest);
selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
monitor_.Init("CoordinateUpdater");
}
@@ -56,19 +56,17 @@ class CoordinateUpdater : public LinearUpdater {
auto dbias = static_cast<float>(tparam_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->Bias()[group_idx] += dbias;
UpdateBiasResidualParallel(group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
ctx_->Threads());
UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
}
// prepare for updating the weights
selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, cparam_.top_k);
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, cparam_.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->learner_model_param->num_feature; i++) {
int fidx = selector_->NextFeature
(i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
int fidx =
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
}
@@ -76,8 +74,8 @@ class CoordinateUpdater : public LinearUpdater {
monitor_.Stop("UpdateFeature");
}
inline void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair,
DMatrix *p_fmat, gbm::GBLinearModel *model) {
void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model) {
const int ngroup = model->learner_model_param->num_output_group;
bst_float &w = (*model)[fidx][group_idx];
auto gradient = GetGradientParallel(ctx_, group_idx, ngroup, fidx,
@@ -87,8 +85,7 @@ class CoordinateUpdater : public LinearUpdater {
CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm));
w += dw;
UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat,
ctx_->Threads());
UpdateResidualParallel(ctx_, fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
}
private:

View File

@@ -32,7 +32,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
void Configure(Args const& args) override {
tparam_.UpdateAllowUnknown(args);
coord_param_.UpdateAllowUnknown(args);
selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
monitor_.Init("GPUCoordinateUpdater");
}
@@ -53,7 +53,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
CHECK(p_fmat->SingleColBlock());
SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
SparsePage const &batch = *(p_fmat->GetBatches<CSCPage>(ctx_).begin());
auto page = batch.GetView();
if (IsEmpty()) {
@@ -112,16 +112,15 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
this->UpdateBias(model);
monitor_.Stop("UpdateBias");
// prepare for updating the weights
selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
coord_param_.top_k);
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, coord_param_.top_k);
monitor_.Start("UpdateFeature");
for (uint32_t group_idx = 0; group_idx < model->learner_model_param->num_output_group;
++group_idx) {
for (auto i = 0U; i < model->learner_model_param->num_feature; i++) {
auto fidx = selector_->NextFeature(
i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
auto fidx =
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, model);
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018 by Contributors
/**
* Copyright 2018-2023 by XGBoost Contributors
* \author Tianqi Chen, Rory Mitchell
*/
@@ -21,7 +21,7 @@ class ShotgunUpdater : public LinearUpdater {
LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
<< "Supported options are: {cyclic, shuffle}";
}
selector_.reset(FeatureSelector::Create(param_.feature_selector, ctx_->Threads()));
selector_.reset(FeatureSelector::Create(param_.feature_selector));
}
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
@@ -45,18 +45,17 @@ class ShotgunUpdater : public LinearUpdater {
auto dbias = static_cast<bst_float>(param_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->Bias()[gid] += dbias;
UpdateBiasResidualParallel(gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
ctx_->Threads());
UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
}
// lock-free parallel updates of weights
selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
param_.reg_lambda_denorm, 0);
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx_)) {
auto page = batch.GetView();
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) {
int ii = selector_->NextFeature(i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
int ii = selector_->NextFeature(ctx_, i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm);
if (ii < 0) return;
const bst_uint fid = ii;