Remove distcol updater. (#5507)

Closes #5498.
This commit is contained in:
Jiaming Yuan
2020-04-10 12:52:56 +08:00
committed by GitHub
parent 7d52c0b8c2
commit bd653fad4c
5 changed files with 1 addition and 234 deletions

View File

@@ -1,67 +0,0 @@
/*!
* Copyright 2014 by Contributors
* \file bitmap.h
* \brief a simple implement of bitmap
* NOTE: the bitmap is only thread-safe per word access; remember this when using the bitmap
* \author Tianqi Chen
*/
#ifndef XGBOOST_COMMON_BITMAP_H_
#define XGBOOST_COMMON_BITMAP_H_
#include <dmlc/omp.h>
#include <vector>
namespace xgboost {
namespace common {
/*! \brief bit map that contains set of bit indicators */
struct BitMap {
  /*! \brief internal storage: one uint32_t word holds 32 bit indicators */
  std::vector<uint32_t> data;
  /*!
   * \brief resize the bitmap to hold at least `size` bits
   * \param size the number of bits the bitmap must cover
   */
  inline void Resize(size_t size) {
    // (size + 31) / 32 words; newly appended words are zero-initialised,
    // but words that already existed keep their previous contents
    data.resize((size + 31U) >> 5, 0);
  }
  /*!
   * \brief query the i-th position of bitmap
   * \param i the position index
   * \return whether bit i is set
   */
  inline bool Get(size_t i) const {
    return (data[i >> 5] >> (i & 31U)) & 1U;
  }
  /*!
   * \brief set i-th position to true
   * \param i position index
   * NOTE: only thread-safe per word — concurrent callers must touch
   * distinct 32-bit words
   */
  inline void SetTrue(size_t i) {
    // 1U: shifting a *signed* 1 into bit 31 is undefined behaviour
    data[i >> 5] |= (1U << (i & 31U));
  }
  /*! \brief initialize the value of bit map from a 0/1 vector */
  inline void InitFromBool(const std::vector<int>& vec) {
    this->Resize(vec.size());
    // number of words fully covered by the input; processed in parallel
    auto nsize = static_cast<bst_omp_uint>(vec.size() / 32);
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < nsize; ++i) {
      uint32_t res = 0;
      for (int k = 0; k < 32; ++k) {
        uint32_t bit = vec[(i << 5) | k];
        res |= (bit << k);
      }
      data[i] = res;
    }
    // If a trailing partial word exists, clear it so stale bits from a
    // previous use of this bitmap cannot leak through.  The old test
    // (`nsize != vec.size()`, word count vs. bit count) wrongly zeroed a
    // fully written word whenever vec.size() was a nonzero multiple of 32.
    if (nsize != data.size()) {
      data.back() = 0;
    }
    // Handle the remaining tail bits sequentially.  The loop must start at
    // the first uncovered *bit* (nsize * 32), not at word index nsize as the
    // old code did — together with the bug above that lost bit 0 for
    // vec.size() == 32.
    for (size_t i = static_cast<size_t>(nsize) * 32; i < vec.size(); ++i) {
      if (vec[i]) this->SetTrue(i);
    }
  }
  /*! \brief clear the bitmap, set all places to false */
  inline void Clear() {
    std::fill(data.begin(), data.end(), 0U);
  }
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_BITMAP_H_

View File

@@ -195,8 +195,7 @@ class GBTree : public GradientBooster {
void LoadModel(Json const& in) override;
bool AllowLazyCheckPoint() const override {
// NOTE(review): this is a unified-diff rendering with the +/- markers
// stripped.  The next two lines are the *removed* implementation (which
// additionally allowed lazy checkpointing when the "distcol" updater was
// configured); the single return after them is the *added* replacement.
return model_.learner_model_param->num_output_group == 1 ||
tparam_.updater_seq.find("distcol") != std::string::npos;
return model_.learner_model_param->num_output_group == 1;
}
void PredictBatch(DMatrix* p_fmat,

View File

@@ -888,8 +888,6 @@ class LearnerImpl : public LearnerIO {
CHECK(tparam_.dsplit != DataSplitMode::kAuto)
<< "Precondition violated; dsplit cannot be 'auto' in distributed mode";
if (tparam_.dsplit == DataSplitMode::kCol) {
// 'distcol' updater hidden until it becomes functional again
// See discussion at https://github.com/dmlc/xgboost/issues/1832
LOG(FATAL) << "Column-wise data split is currently not supported.";
}
}

View File

@@ -17,7 +17,6 @@
#include "param.h"
#include "constraints.h"
#include "../common/random.h"
#include "../common/bitmap.h"
#include "split_evaluator.h"
namespace xgboost {
@@ -618,171 +617,10 @@ class ColMaker: public TreeUpdater {
};
};
// distributed column maker: a column-split (feature-parallel) variant of
// ColMaker.  Each worker holds a subset of the columns; best splits and row
// positions are synchronised across workers via rabit allreduce.
// Registered under the updater name "distcol".
class DistColMaker : public ColMaker {
public:
// Set up training parameters, the embedded pruner, and the split evaluator.
void Configure(const Args& args) override {
param_.UpdateAllowUnknown(args);
// the pruner runs after every tree update and — per the comment in
// Update() below — is also responsible for syncing the pruned tree
pruner_.reset(TreeUpdater::Create("prune", tparam_));
pruner_->Configure(args);
spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
spliteval_->Init(&param_);
}
char const* Name() const override {
return "distcol";
}
// Grow exactly one tree from the gradient statistics in gpair, then prune
// it and refresh the per-row node positions.
void Update(HostDeviceVector<GradientPair> *gpair,
DMatrix* dmat,
const std::vector<RegTree*> &trees) override {
CHECK_EQ(trees.size(), 1U) << "DistColMaker: only support one tree at a time";
this->LazyGetColumnDensity(dmat);
// a fresh Builder per update; the split evaluator is cloned so the
// prototype held by spliteval_ stays untouched
Builder builder(
param_,
colmaker_param_,
std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()),
interaction_constraints_, column_densities_);
// build the tree
builder.Update(gpair->ConstHostVector(), dmat, trees[0]);
// prune the tree; note that the pruner will sync the tree across workers
pruner_->Update(gpair, dmat, trees);
// update position after the tree is pruned
builder.UpdatePosition(dmat, *trees[0]);
}
private:
// Builder that overrides the position-routing and best-split steps of
// ColMaker::Builder with rabit-synchronised versions.
class Builder : public ColMaker::Builder {
public:
explicit Builder(const TrainParam &param,
ColMakerTrainParam const &colmaker_train_param,
std::unique_ptr<SplitEvaluator> spliteval,
FeatureInteractionConstraintHost _interaction_constraints,
const std::vector<float> &column_densities)
: ColMaker::Builder(param, colmaker_train_param,
std::move(spliteval),
std::move(_interaction_constraints),
column_densities) {}
// Re-assign every row to a node of the (pruned) tree: a row pointing at a
// deleted node is walked up to its nearest surviving ancestor.
inline void UpdatePosition(DMatrix* p_fmat, const RegTree &tree) {
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
int nid = this->DecodePosition(ridx);
while (tree[nid].IsDeleted()) {
nid = tree[nid].Parent();
CHECK_GE(nid, 0);
}
this->position_[ridx] = nid;
}
}
protected:
// Route rows whose split feature is present (non-missing) to the correct
// child.  Each worker only sees its own columns, so rows that must go
// *against* the node's default direction are recorded in a bitmap and
// OR-reduced across all workers before positions are updated.
void SetNonDefaultPosition(const std::vector<int> &qexpand, DMatrix *p_fmat,
const RegTree &tree) override {
// step 2, classify the non-default data into right places
std::vector<unsigned> fsplits;
for (int nid : qexpand) {
if (!tree[nid].IsLeaf()) {
fsplits.push_back(tree[nid].SplitIndex());
}
}
// get the candidate split index
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// drop split features this worker does not own (column split means a
// worker's DMatrix only has a subset of the columns)
while (fsplits.size() != 0 && fsplits.back() >= p_fmat->Info().num_col_) {
fsplits.pop_back();
}
// bitmap is only word concurrent, set to bool first
{
auto ndata = static_cast<bst_omp_uint>(this->position_.size());
boolmap_.resize(ndata);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
boolmap_[j] = 0;
}
}
for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
for (auto fid : fsplits) {
auto col = batch[fid];
const auto ndata = static_cast<bst_omp_uint>(col.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const bst_float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
// mark the row iff it goes against the node's default direction
if (!tree[nid].IsLeaf() && tree[nid].SplitIndex() == fid) {
if (fvalue < tree[nid].SplitCond()) {
if (!tree[nid].DefaultLeft()) boolmap_[ridx] = 1;
} else {
if (tree[nid].DefaultLeft()) boolmap_[ridx] = 1;
}
}
}
}
}
bitmap_.InitFromBool(boolmap_);
// communicate bitmap
rabit::Allreduce<rabit::op::BitOR>(dmlc::BeginPtr(bitmap_.data), bitmap_.data.size());
// get the new position
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
const int nid = this->DecodePosition(ridx);
if (bitmap_.Get(ridx)) {
// a set bit means some worker observed the feature and routed the
// row against the default; a leaf here means workers disagree
CHECK(!tree[nid].IsLeaf()) << "inconsistent reduce information";
if (tree[nid].DefaultLeft()) {
this->SetEncodePosition(ridx, tree[nid].RightChild());
} else {
this->SetEncodePosition(ridx, tree[nid].LeftChild());
}
}
}
}
// synchronize the best solution of each node: reduce each worker's local
// per-thread best into one SplitEntry per node, then allreduce across
// workers so every worker agrees on the winning split
void SyncBestSolution(const std::vector<int> &qexpand) override {
std::vector<SplitEntry> vec;
for (int nid : qexpand) {
for (int tid = 0; tid < this->nthread_; ++tid) {
this->snode_[nid].best.Update(this->stemp_[tid][nid].best);
}
vec.push_back(this->snode_[nid].best);
}
// TODO(tqchen) lazy version
// communicate best solution
reducer_.Allreduce(dmlc::BeginPtr(vec), vec.size());
// assign solution back
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
this->snode_[nid].best = vec[i];
}
}
private:
// one bit per row: set when the row must go against the default direction
common::BitMap bitmap_;
// per-row scratch flags; the bitmap is only word-level thread-safe, so the
// parallel marking phase writes here first (see SetNonDefaultPosition)
std::vector<int> boolmap_;
// allreduce reducer combining per-node best splits across workers
rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer_;
};
// we directly introduce pruner here
std::unique_ptr<TreeUpdater> pruner_;
// training parameter
TrainParam param_;
// Cloned for each builder instantiation
std::unique_ptr<SplitEvaluator> spliteval_;
FeatureInteractionConstraintHost interaction_constraints_;
};
// Register ColMaker under the updater name "grow_colmaker".
XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker")
.describe("Grow tree with parallelization over columns.")
.set_body([]() {
return new ColMaker();
});
// Register DistColMaker under the updater name "distcol" (this registration
// is removed by the surrounding commit along with the class itself).
XGBOOST_REGISTER_TREE_UPDATER(DistColMaker, "distcol")
.describe("Distributed column split version of tree maker.")
.set_body([]() {
return new DistColMaker();
});
} // namespace tree
} // namespace xgboost