Remove distcol updater. (#5507)

Closes #5498.
This commit is contained in:
Jiaming Yuan 2020-04-10 12:52:56 +08:00 committed by GitHub
parent 7d52c0b8c2
commit bd653fad4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1 additions and 234 deletions

View File

@ -157,7 +157,6 @@ Parameters for Tree Booster
- A comma separated string defining the sequence of tree updaters to run, providing a modular way to construct and to modify the trees. This is an advanced parameter that is usually set automatically, depending on some other parameters. However, it could be also set explicitly by a user. The following updaters exist:
- ``grow_colmaker``: non-distributed column-based construction of trees.
- ``distcol``: distributed tree construction with column-based data splitting mode.
- ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
- ``grow_local_histmaker``: based on local histogram counting.
- ``grow_skmaker``: uses the approximate sketching algorithm.

View File

@ -1,67 +0,0 @@
/*!
* Copyright 2014 by Contributors
* \file bitmap.h
* \brief a simple implement of bitmap
* NOTE: bitmap is only threadsafe per word access, remember this when using bitmap
* \author Tianqi Chen
*/
#ifndef XGBOOST_COMMON_BITMAP_H_
#define XGBOOST_COMMON_BITMAP_H_
#include <dmlc/omp.h>
#include <vector>
namespace xgboost {
namespace common {
/*! \brief bit map that contains set of bit indicators */
struct BitMap {
/*! \brief internal data structure */
std::vector<uint32_t> data;
/*!
* \brief resize the bitmap to be certain size
* \param size the size of bitmap
*/
inline void Resize(size_t size) {
data.resize((size + 31U) >> 5, 0);
}
/*!
* \brief query the i-th position of bitmap
* \param i the position in
*/
inline bool Get(size_t i) const {
return (data[i >> 5] >> (i & 31U)) & 1U;
}
/*!
* \brief set i-th position to true
* \param i position index
*/
inline void SetTrue(size_t i) {
data[i >> 5] |= (1 << (i & 31U));
}
/*! \brief initialize the value of bit map from vector of bool*/
inline void InitFromBool(const std::vector<int>& vec) {
this->Resize(vec.size());
// parallel over the full cases
auto nsize = static_cast<bst_omp_uint>(vec.size() / 32);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
uint32_t res = 0;
for (int k = 0; k < 32; ++k) {
uint32_t bit = vec[(i << 5) | k];
res |= (bit << k);
}
data[i] = res;
}
if (nsize != vec.size()) data.back() = 0;
for (size_t i = nsize; i < vec.size(); ++i) {
if (vec[i]) this->SetTrue(i);
}
}
/*! \brief clear the bitmap, set all places to false */
inline void Clear() {
std::fill(data.begin(), data.end(), 0U);
}
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_BITMAP_H_

View File

@ -195,8 +195,7 @@ class GBTree : public GradientBooster {
void LoadModel(Json const& in) override;
bool AllowLazyCheckPoint() const override {
return model_.learner_model_param->num_output_group == 1 ||
tparam_.updater_seq.find("distcol") != std::string::npos;
return model_.learner_model_param->num_output_group == 1;
}
void PredictBatch(DMatrix* p_fmat,

View File

@ -888,8 +888,6 @@ class LearnerImpl : public LearnerIO {
CHECK(tparam_.dsplit != DataSplitMode::kAuto)
<< "Precondition violated; dsplit cannot be 'auto' in distributed mode";
if (tparam_.dsplit == DataSplitMode::kCol) {
// 'distcol' updater hidden until it becomes functional again
// See discussion at https://github.com/dmlc/xgboost/issues/1832
LOG(FATAL) << "Column-wise data split is currently not supported.";
}
}

View File

@ -17,7 +17,6 @@
#include "param.h"
#include "constraints.h"
#include "../common/random.h"
#include "../common/bitmap.h"
#include "split_evaluator.h"
namespace xgboost {
@ -618,171 +617,10 @@ class ColMaker: public TreeUpdater {
};
};
// distributed column maker
class DistColMaker : public ColMaker {
public:
void Configure(const Args& args) override {
param_.UpdateAllowUnknown(args);
pruner_.reset(TreeUpdater::Create("prune", tparam_));
pruner_->Configure(args);
spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
spliteval_->Init(&param_);
}
char const* Name() const override {
return "distcol";
}
void Update(HostDeviceVector<GradientPair> *gpair,
DMatrix* dmat,
const std::vector<RegTree*> &trees) override {
CHECK_EQ(trees.size(), 1U) << "DistColMaker: only support one tree at a time";
this->LazyGetColumnDensity(dmat);
Builder builder(
param_,
colmaker_param_,
std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()),
interaction_constraints_, column_densities_);
// build the tree
builder.Update(gpair->ConstHostVector(), dmat, trees[0]);
//// prune the tree, note that pruner will sync the tree
pruner_->Update(gpair, dmat, trees);
// update position after the tree is pruned
builder.UpdatePosition(dmat, *trees[0]);
}
private:
class Builder : public ColMaker::Builder {
public:
explicit Builder(const TrainParam &param,
ColMakerTrainParam const &colmaker_train_param,
std::unique_ptr<SplitEvaluator> spliteval,
FeatureInteractionConstraintHost _interaction_constraints,
const std::vector<float> &column_densities)
: ColMaker::Builder(param, colmaker_train_param,
std::move(spliteval),
std::move(_interaction_constraints),
column_densities) {}
inline void UpdatePosition(DMatrix* p_fmat, const RegTree &tree) {
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
int nid = this->DecodePosition(ridx);
while (tree[nid].IsDeleted()) {
nid = tree[nid].Parent();
CHECK_GE(nid, 0);
}
this->position_[ridx] = nid;
}
}
protected:
void SetNonDefaultPosition(const std::vector<int> &qexpand, DMatrix *p_fmat,
const RegTree &tree) override {
// step 2, classify the non-default data into right places
std::vector<unsigned> fsplits;
for (int nid : qexpand) {
if (!tree[nid].IsLeaf()) {
fsplits.push_back(tree[nid].SplitIndex());
}
}
// get the candidate split index
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
while (fsplits.size() != 0 && fsplits.back() >= p_fmat->Info().num_col_) {
fsplits.pop_back();
}
// bitmap is only word concurrent, set to bool first
{
auto ndata = static_cast<bst_omp_uint>(this->position_.size());
boolmap_.resize(ndata);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
boolmap_[j] = 0;
}
}
for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
for (auto fid : fsplits) {
auto col = batch[fid];
const auto ndata = static_cast<bst_omp_uint>(col.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const bst_float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
if (!tree[nid].IsLeaf() && tree[nid].SplitIndex() == fid) {
if (fvalue < tree[nid].SplitCond()) {
if (!tree[nid].DefaultLeft()) boolmap_[ridx] = 1;
} else {
if (tree[nid].DefaultLeft()) boolmap_[ridx] = 1;
}
}
}
}
}
bitmap_.InitFromBool(boolmap_);
// communicate bitmap
rabit::Allreduce<rabit::op::BitOR>(dmlc::BeginPtr(bitmap_.data), bitmap_.data.size());
// get the new position
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
const int nid = this->DecodePosition(ridx);
if (bitmap_.Get(ridx)) {
CHECK(!tree[nid].IsLeaf()) << "inconsistent reduce information";
if (tree[nid].DefaultLeft()) {
this->SetEncodePosition(ridx, tree[nid].RightChild());
} else {
this->SetEncodePosition(ridx, tree[nid].LeftChild());
}
}
}
}
// synchronize the best solution of each node
void SyncBestSolution(const std::vector<int> &qexpand) override {
std::vector<SplitEntry> vec;
for (int nid : qexpand) {
for (int tid = 0; tid < this->nthread_; ++tid) {
this->snode_[nid].best.Update(this->stemp_[tid][nid].best);
}
vec.push_back(this->snode_[nid].best);
}
// TODO(tqchen) lazy version
// communicate best solution
reducer_.Allreduce(dmlc::BeginPtr(vec), vec.size());
// assign solution back
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
this->snode_[nid].best = vec[i];
}
}
private:
common::BitMap bitmap_;
std::vector<int> boolmap_;
rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer_;
};
// we directly introduce pruner here
std::unique_ptr<TreeUpdater> pruner_;
// training parameter
TrainParam param_;
// Cloned for each builder instantiation
std::unique_ptr<SplitEvaluator> spliteval_;
FeatureInteractionConstraintHost interaction_constraints_;
};
XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker")
.describe("Grow tree with parallelization over columns.")
.set_body([]() {
return new ColMaker();
});
XGBOOST_REGISTER_TREE_UPDATER(DistColMaker, "distcol")
.describe("Distributed column split version of tree maker.")
.set_body([]() {
return new DistColMaker();
});
} // namespace tree
} // namespace xgboost