Remove omp_get_max_threads in gbm and linear. (#7537)

* Use ctx in gbm.

* Use ctx threads in gbm and linear.

Jiaming Yuan
2022-01-05 03:28:52 +08:00
committed by GitHub
parent eea094e1bc
commit 28af6f9abb
12 changed files with 124 additions and 135 deletions
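
The recurring change below replaces raw #pragma omp parallel for loops, which size their thread team implicitly from the OpenMP runtime (effectively from omp_get_max_threads()), with common::ParallelFor calls that take the thread count explicitly from the learner's context via ctx_->Threads(). As a rough mental model, a ParallelFor-style helper with an explicit thread count can be sketched as follows; this is an illustrative stand-in, not XGBoost's actual implementation (judging from the exc.Run/exc.Rethrow scaffolding that disappears in the hunks below, the real helper also routes exceptions thrown inside the body through dmlc::OMPException):

// Sketch of a ParallelFor-style helper with an explicit thread count.
// Illustrative stand-in only, not XGBoost's actual implementation.
#include <cstdint>

template <typename Index, typename Func>
void ParallelForSketch(Index n, std::int32_t n_threads, Func const &fn) {
#pragma omp parallel for schedule(static) num_threads(n_threads)
  for (Index i = 0; i < n; ++i) {
    fn(i);  // each index is visited exactly once by some thread
  }
}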


@@ -149,21 +149,21 @@ GetGradientParallel(GenericParameter const *ctx, int group_idx, int num_group,
  */
 inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_group,
                                                          const std::vector<GradientPair> &gpair,
-                                                         DMatrix *p_fmat) {
-  double sum_grad = 0.0, sum_hess = 0.0;
+                                                         DMatrix *p_fmat, int32_t n_threads) {
   const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
-  dmlc::OMPException exc;
-#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
-  for (bst_omp_uint i = 0; i < ndata; ++i) {
-    exc.Run([&]() {
-      auto &p = gpair[i * num_group + group_idx];
-      if (p.GetHess() >= 0.0f) {
-        sum_grad += p.GetGrad();
-        sum_hess += p.GetHess();
-      }
-    });
-  }
-  exc.Rethrow();
+  std::vector<double> sum_grad_tloc(n_threads, 0);
+  std::vector<double> sum_hess_tloc(n_threads, 0);
+  common::ParallelFor(ndata, n_threads, [&](auto i) {
+    auto tid = omp_get_thread_num();
+    auto &p = gpair[i * num_group + group_idx];
+    if (p.GetHess() >= 0.0f) {
+      sum_grad_tloc[tid] += p.GetGrad();
+      sum_hess_tloc[tid] += p.GetHess();
+    }
+  });
+  double sum_grad = std::accumulate(sum_grad_tloc.cbegin(), sum_grad_tloc.cend(), 0.0);
+  double sum_hess = std::accumulate(sum_hess_tloc.cbegin(), sum_hess_tloc.cend(), 0.0);
   return std::make_pair(sum_grad, sum_hess);
 }
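
Note the shape of this rewrite: common::ParallelFor does not expose OpenMP's reduction clause, so the loop accumulates into one slot per thread (sum_grad_tloc, sum_hess_tloc) and combines the partials sequentially with std::accumulate. A self-contained sketch of the same idiom in plain OpenMP, with illustrative names and types rather than XGBoost's:

#include <omp.h>

#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

// Per-thread partial sums instead of a reduction clause; mirrors the
// *_tloc pattern above. Pairs are (gradient, hessian).
std::pair<double, double> SumValidPairs(
    std::vector<std::pair<float, float>> const &gpair, std::int32_t n_threads) {
  std::vector<double> grad_tloc(n_threads, 0.0);
  std::vector<double> hess_tloc(n_threads, 0.0);
#pragma omp parallel for schedule(static) num_threads(n_threads)
  for (std::int64_t i = 0; i < static_cast<std::int64_t>(gpair.size()); ++i) {
    auto tid = omp_get_thread_num();  // index of this thread's private slot
    if (gpair[i].second >= 0.0f) {    // skip invalidated pairs
      grad_tloc[tid] += gpair[i].first;
      hess_tloc[tid] += gpair[i].second;
    }
  }
  // Sequential combine: deterministic order for a fixed thread count.
  return {std::accumulate(grad_tloc.cbegin(), grad_tloc.cend(), 0.0),
          std::accumulate(hess_tloc.cbegin(), hess_tloc.cend(), 0.0)};
}

One side effect worth noting: with a static schedule and a fixed n_threads, the per-thread buffers plus the sequential combine give a reproducible summation order, which an OpenMP reduction clause does not guarantee.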
@@ -179,23 +179,18 @@ inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_
  */
 inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
                                    float dw, std::vector<GradientPair> *in_gpair,
-                                   DMatrix *p_fmat) {
+                                   DMatrix *p_fmat, int32_t n_threads) {
   if (dw == 0.0f) return;
   for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
     auto page = batch.GetView();
     auto col = page[fidx];
     // update grad value
     const auto num_row = static_cast<bst_omp_uint>(col.size());
-    dmlc::OMPException exc;
-#pragma omp parallel for schedule(static)
-    for (bst_omp_uint j = 0; j < num_row; ++j) {
-      exc.Run([&]() {
-        GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
-        if (p.GetHess() < 0.0f) return;
-        p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
-      });
-    }
-    exc.Rethrow();
+    common::ParallelFor(num_row, n_threads, [&](auto j) {
+      GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
+      if (p.GetHess() < 0.0f) return;
+      p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
+    });
   }
 }
@@ -209,20 +204,15 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
  * \param p_fmat The input feature matrix.
  */
 inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias,
-                                       std::vector<GradientPair> *in_gpair,
-                                       DMatrix *p_fmat) {
+                                       std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
+                                       int32_t n_threads) {
   if (dbias == 0.0f) return;
   const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
-  dmlc::OMPException exc;
-#pragma omp parallel for schedule(static)
-  for (bst_omp_uint i = 0; i < ndata; ++i) {
-    exc.Run([&]() {
-      GradientPair &g = (*in_gpair)[i * num_group + group_idx];
-      if (g.GetHess() < 0.0f) return;
-      g += GradientPair(g.GetHess() * dbias, 0);
-    });
-  }
-  exc.Rethrow();
+  common::ParallelFor(ndata, n_threads, [&](auto i) {
+    GradientPair &g = (*in_gpair)[i * num_group + group_idx];
+    if (g.GetHess() < 0.0f) return;
+    g += GradientPair(g.GetHess() * dbias, 0);
+  });
 }

 /**
@@ -230,9 +220,13 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias
  * in coordinate descent algorithms.
  */
 class FeatureSelector {
+ protected:
+  int32_t n_threads_{-1};
+
  public:
+  explicit FeatureSelector(int32_t n_threads) : n_threads_{n_threads} {}
   /*! \brief factory method */
-  static FeatureSelector *Create(int choice);
+  static FeatureSelector *Create(int choice, int32_t n_threads);
   /*! \brief virtual destructor */
   virtual ~FeatureSelector() = default;
   /**
@@ -274,6 +268,7 @@ class FeatureSelector {
  */
 class CyclicFeatureSelector : public FeatureSelector {
  public:
+  using FeatureSelector::FeatureSelector;
   int NextFeature(int iteration, const gbm::GBLinearModel &model,
                   int , const std::vector<GradientPair> &,
                   DMatrix *, float, float) override {
@@ -287,6 +282,7 @@ class CyclicFeatureSelector : public FeatureSelector {
  */
 class ShuffleFeatureSelector : public FeatureSelector {
  public:
+  using FeatureSelector::FeatureSelector;
   void Setup(const gbm::GBLinearModel &model,
              const std::vector<GradientPair>&,
              DMatrix *, float, float, int) override {
@@ -313,6 +309,7 @@ class ShuffleFeatureSelector : public FeatureSelector {
  */
 class RandomFeatureSelector : public FeatureSelector {
  public:
+  using FeatureSelector::FeatureSelector;
   int NextFeature(int, const gbm::GBLinearModel &model,
                   int, const std::vector<GradientPair> &,
                   DMatrix *, float, float) override {
@@ -331,6 +328,7 @@ class RandomFeatureSelector : public FeatureSelector {
  */
 class GreedyFeatureSelector : public FeatureSelector {
  public:
+  using FeatureSelector::FeatureSelector;
   void Setup(const gbm::GBLinearModel &model,
              const std::vector<GradientPair> &,
              DMatrix *, float, float, int param) override {
@@ -360,7 +358,7 @@ class GreedyFeatureSelector : public FeatureSelector {
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
     for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
       auto page = batch.GetView();
-      common::ParallelFor(nfeat, [&](bst_omp_uint i) {
+      common::ParallelFor(nfeat, this->n_threads_, [&](bst_omp_uint i) {
         const auto col = page[i];
         const bst_uint ndata = col.size();
         auto &sums = gpair_sums_[group_idx * nfeat + i];
@@ -407,6 +405,7 @@ class GreedyFeatureSelector : public FeatureSelector {
  */
 class ThriftyFeatureSelector : public FeatureSelector {
  public:
+  using FeatureSelector::FeatureSelector;
   void Setup(const gbm::GBLinearModel &model,
              const std::vector<GradientPair> &gpair,
              DMatrix *p_fmat, float alpha, float lambda, int param) override {
@@ -426,7 +425,7 @@ class ThriftyFeatureSelector : public FeatureSelector {
     for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
       auto page = batch.GetView();
       // column-parallel is usually faster than row-parallel
-      common::ParallelFor(nfeat, [&](bst_omp_uint i) {
+      common::ParallelFor(nfeat, this->n_threads_, [&](auto i) {
         const auto col = page[i];
         const bst_uint ndata = col.size();
         for (bst_uint gid = 0u; gid < ngroup; ++gid) {
@@ -483,18 +482,18 @@ class ThriftyFeatureSelector : public FeatureSelector {
   std::vector<std::pair<double, double>> gpair_sums_;
 };

-inline FeatureSelector *FeatureSelector::Create(int choice) {
+inline FeatureSelector *FeatureSelector::Create(int choice, int32_t n_threads) {
   switch (choice) {
     case kCyclic:
-      return new CyclicFeatureSelector();
+      return new CyclicFeatureSelector(n_threads);
     case kShuffle:
-      return new ShuffleFeatureSelector();
+      return new ShuffleFeatureSelector(n_threads);
     case kThrifty:
-      return new ThriftyFeatureSelector();
+      return new ThriftyFeatureSelector(n_threads);
     case kGreedy:
-      return new GreedyFeatureSelector();
+      return new GreedyFeatureSelector(n_threads);
     case kRandom:
-      return new RandomFeatureSelector();
+      return new RandomFeatureSelector(n_threads);
     default:
       LOG(FATAL) << "unknown coordinate selector: " << choice;
   }
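
With this factory change, the thread count is injected once at construction rather than each selector consulting the OpenMP runtime when it parallelizes; the using FeatureSelector::FeatureSelector; declarations above let every concrete selector inherit the new constructor without restating it. A minimal sketch of the pattern, with illustrative class names:

#include <cstdint>
#include <memory>

class Selector {
 protected:
  std::int32_t n_threads_{-1};  // fixed at construction, used by subclasses

 public:
  explicit Selector(std::int32_t n_threads) : n_threads_{n_threads} {}
  virtual ~Selector() = default;
};

class CyclicSelector : public Selector {
 public:
  using Selector::Selector;  // inherit the injecting constructor
};

std::unique_ptr<Selector> CreateSelector(std::int32_t n_threads) {
  // The real factory switches over all selector kinds; one case shown.
  return std::make_unique<CyclicSelector>(n_threads);
}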


@@ -17,7 +17,7 @@ LinearUpdater* LinearUpdater::Create(const std::string& name, GenericParameter c
     LOG(FATAL) << "Unknown linear updater " << name;
   }
   auto p_linear = (e->body)();
-  p_linear->learner_param_ = lparam;
+  p_linear->ctx_ = lparam;
   return p_linear;
 }


@@ -30,7 +30,7 @@ class CoordinateUpdater : public LinearUpdater {
       tparam_.UpdateAllowUnknown(args)
     };
     cparam_.UpdateAllowUnknown(rest);
-    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
     monitor_.Init("CoordinateUpdater");
   }
@@ -51,13 +51,13 @@ class CoordinateUpdater : public LinearUpdater {
     const int ngroup = model->learner_model_param->num_output_group;
     // update bias
     for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
-      auto grad = GetBiasGradientParallel(group_idx, ngroup,
-                                          in_gpair->ConstHostVector(), p_fmat);
+      auto grad = GetBiasGradientParallel(group_idx, ngroup, in_gpair->ConstHostVector(), p_fmat,
+                                          ctx_->Threads());
       auto dbias = static_cast<float>(tparam_.learning_rate *
                                       CoordinateDeltaBias(grad.first, grad.second));
       model->Bias()[group_idx] += dbias;
-      UpdateBiasResidualParallel(group_idx, ngroup,
-                                 dbias, &in_gpair->HostVector(), p_fmat);
+      UpdateBiasResidualParallel(group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
+                                 ctx_->Threads());
     }
     // prepare for updating the weights
     selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
@@ -80,14 +80,15 @@ class CoordinateUpdater : public LinearUpdater {
                      DMatrix *p_fmat, gbm::GBLinearModel *model) {
     const int ngroup = model->learner_model_param->num_output_group;
     bst_float &w = (*model)[fidx][group_idx];
-    auto gradient = GetGradientParallel(learner_param_, group_idx, ngroup, fidx,
+    auto gradient = GetGradientParallel(ctx_, group_idx, ngroup, fidx,
                                         *in_gpair, p_fmat);
     auto dw = static_cast<float>(
         tparam_.learning_rate *
         CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
                         tparam_.reg_lambda_denorm));
     w += dw;
-    UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
+    UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat,
+                           ctx_->Threads());
   }

  private:

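For context on the CoordinateDelta and CoordinateDeltaBias calls that these updaters thread the gradients through: they compute an elastic-net coordinate-descent step from the accumulated gradient and hessian. The following is a textbook-style sketch of such a step under L1 (reg_alpha) and L2 (reg_lambda) penalties, showing the general technique rather than XGBoost's exact code:

#include <algorithm>

// Newton step on one coordinate with L2 shrinkage and L1 soft-thresholding.
// Textbook sketch, not necessarily XGBoost's exact implementation.
double CoordinateDeltaSketch(double sum_grad, double sum_hess, double w,
                             double reg_alpha, double reg_lambda) {
  if (sum_hess < 1e-5) return 0.0;  // degenerate column: no update
  double const grad_l2 = sum_grad + reg_lambda * w;  // gradient incl. L2 term
  double const hess_l2 = sum_hess + reg_lambda;      // curvature incl. L2 term
  if (w - grad_l2 / hess_l2 >= 0.0) {
    // Tentative weight stays positive: add the L1 penalty to the gradient,
    // but never let the weight cross zero within a single step.
    return std::max(-(grad_l2 + reg_alpha) / hess_l2, -w);
  }
  return std::min(-(grad_l2 - reg_alpha) / hess_l2, -w);
}

// The bias carries no penalty, so its update reduces to a plain Newton step.
double CoordinateDeltaBiasSketch(double sum_grad, double sum_hess) {
  return sum_hess < 1e-5 ? 0.0 : -sum_grad / sum_hess;
}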

@@ -32,7 +32,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
   void Configure(Args const& args) override {
     tparam_.UpdateAllowUnknown(args);
     coord_param_.UpdateAllowUnknown(args);
-    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
     monitor_.Init("GPUCoordinateUpdater");
   }
@@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
   }

   void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
-    if (learner_param_->gpu_id < 0) return;
+    if (ctx_->gpu_id < 0) return;

     num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
@@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
       return;
     }

-    dh::safe_cuda(cudaSetDevice(learner_param_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     // The begin and end indices for the section of each column associated with
     // this device
     std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@@ -103,7 +103,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
     monitor_.Start("UpdateGpair");
     auto &in_gpair_host = in_gpair->ConstHostVector();
     // Update gpair
-    if (learner_param_->gpu_id >= 0) {
+    if (ctx_->gpu_id >= 0) {
       this->UpdateGpair(in_gpair_host);
     }
     monitor_.Stop("UpdateGpair");
@@ -134,7 +134,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
          ++group_idx) {
       // Get gradient
       auto grad = GradientPair(0, 0);
-      if (learner_param_->gpu_id >= 0) {
+      if (ctx_->gpu_id >= 0) {
         grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
       }
       auto dbias = static_cast<float>(
@@ -143,7 +143,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
       model->Bias()[group_idx] += dbias;

       // Update residual
-      if (learner_param_->gpu_id >= 0) {
+      if (ctx_->gpu_id >= 0) {
         UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
       }
     }
@@ -155,7 +155,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
     bst_float &w = (*model)[fidx][group_idx];
     // Get gradient
     auto grad = GradientPair(0, 0);
-    if (learner_param_->gpu_id >= 0) {
+    if (ctx_->gpu_id >= 0) {
       grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
     }
     auto dw = static_cast<float>(tparam_.learning_rate *
@@ -164,14 +164,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
                                  tparam_.reg_lambda_denorm));
     w += dw;

-    if (learner_param_->gpu_id >= 0) {
+    if (ctx_->gpu_id >= 0) {
       UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
     }
   }

   // This needs to be public because of the __device__ lambda.
   GradientPair GetBiasGradient(int group_idx, int num_group) {
-    dh::safe_cuda(cudaSetDevice(learner_param_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     auto counting = thrust::make_counting_iterator(0ull);
     auto f = [=] __device__(size_t idx) {
       return idx * num_group + group_idx;
@@ -195,7 +195,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
   // This needs to be public because of the __device__ lambda.
   GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-    dh::safe_cuda(cudaSetDevice(learner_param_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
     size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
     common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);


@@ -21,7 +21,7 @@ class ShotgunUpdater : public LinearUpdater {
       LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
                  << "Supported options are: {cyclic, shuffle}";
     }
-    selector_.reset(FeatureSelector::Create(param_.feature_selector));
+    selector_.reset(FeatureSelector::Create(param_.feature_selector, ctx_->Threads()));
   }

   void LoadConfig(Json const& in) override {
     auto const& config = get<Object const>(in);
@@ -40,12 +40,13 @@ class ShotgunUpdater : public LinearUpdater {
     // update bias
     for (int gid = 0; gid < ngroup; ++gid) {
-      auto grad = GetBiasGradientParallel(gid, ngroup,
-                                          in_gpair->ConstHostVector(), p_fmat);
+      auto grad = GetBiasGradientParallel(gid, ngroup, in_gpair->ConstHostVector(), p_fmat,
+                                          ctx_->Threads());
       auto dbias = static_cast<bst_float>(param_.learning_rate *
                                           CoordinateDeltaBias(grad.first, grad.second));
       model->Bias()[gid] += dbias;
-      UpdateBiasResidualParallel(gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
+      UpdateBiasResidualParallel(gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
+                                 ctx_->Threads());
     }

     // lock-free parallel updates of weights
@@ -54,42 +55,35 @@ class ShotgunUpdater : public LinearUpdater {
     for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
       auto page = batch.GetView();
       const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
-      dmlc::OMPException exc;
-#pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nfeat; ++i) {
-        exc.Run([&]() {
-          int ii = selector_->NextFeature
-            (i, *model, 0, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
-             param_.reg_lambda_denorm);
-          if (ii < 0) return;
-          const bst_uint fid = ii;
-          auto col = page[ii];
-          for (int gid = 0; gid < ngroup; ++gid) {
-            double sum_grad = 0.0, sum_hess = 0.0;
-            for (auto& c : col) {
-              const GradientPair &p = gpair[c.index * ngroup + gid];
-              if (p.GetHess() < 0.0f) continue;
-              const bst_float v = c.fvalue;
-              sum_grad += p.GetGrad() * v;
-              sum_hess += p.GetHess() * v * v;
-            }
-            bst_float &w = (*model)[fid][gid];
-            auto dw = static_cast<bst_float>(
-                param_.learning_rate *
-                CoordinateDelta(sum_grad, sum_hess, w, param_.reg_alpha_denorm,
-                                param_.reg_lambda_denorm));
-            if (dw == 0.f) continue;
-            w += dw;
-            // update grad values
-            for (auto& c : col) {
-              GradientPair &p = gpair[c.index * ngroup + gid];
-              if (p.GetHess() < 0.0f) continue;
-              p += GradientPair(p.GetHess() * c.fvalue * dw, 0);
-            }
-          }
-        });
-      }
-      exc.Rethrow();
+      common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) {
+        int ii = selector_->NextFeature(i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
+                                        param_.reg_alpha_denorm, param_.reg_lambda_denorm);
+        if (ii < 0) return;
+        const bst_uint fid = ii;
+        auto col = page[ii];
+        for (int gid = 0; gid < ngroup; ++gid) {
+          double sum_grad = 0.0, sum_hess = 0.0;
+          for (auto &c : col) {
+            const GradientPair &p = gpair[c.index * ngroup + gid];
+            if (p.GetHess() < 0.0f) continue;
+            const bst_float v = c.fvalue;
+            sum_grad += p.GetGrad() * v;
+            sum_hess += p.GetHess() * v * v;
+          }
+          bst_float &w = (*model)[fid][gid];
+          auto dw = static_cast<bst_float>(
+              param_.learning_rate * CoordinateDelta(sum_grad, sum_hess, w, param_.reg_alpha_denorm,
+                                                     param_.reg_lambda_denorm));
+          if (dw == 0.f) continue;
+          w += dw;
+          // update grad values
+          for (auto &c : col) {
+            GradientPair &p = gpair[c.index * ngroup + gid];
+            if (p.GetHess() < 0.0f) continue;
+            p += GradientPair(p.GetHess() * c.fvalue * dw, 0);
+          }
+        }
+      });
     }
   }
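
Finally, a hypothetical standalone program illustrating why the commit threads ctx_->Threads() through instead of calling omp_get_max_threads() inside each kernel: the OpenMP runtime's global setting can drift away from the thread count the learner was configured with, whereas an explicitly passed value cannot.

#include <omp.h>

#include <cstdio>

int main() {
  int const configured = 4;  // pretend the learner was configured with nthread=4
  omp_set_num_threads(2);    // some other component changes the global runtime state
  // A kernel sized from omp_get_max_threads() now disagrees with the
  // configured value; a kernel taking the count as a parameter does not.
  std::printf("omp_get_max_threads() = %d, configured = %d\n",
              omp_get_max_threads(), configured);
  return 0;
}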