Distributed Fast Histogram Algorithm (#4011)
* add back train method but mark as deprecated * add back train method but mark as deprecated * add back train method but mark as deprecated * fix scalastyle error * fix scalastyle error * fix scalastyle error * fix scalastyle error * init * allow hist algo * more changes * temp * update * remove hist sync * update rabit * change hist size * change the histogram * update kfactor * sync per node stats * temp * update * final * code clean * update rabit * more cleanup * fix errors * fix failed tests * enforce c++11 * fix lint issue * broadcast subsampled feature correctly * revert some changes * fix lint issue * enable monotone and interaction constraints * don't specify default for monotone and interactions * update docs
This commit is contained in:
@@ -83,9 +83,9 @@ void HistCutMatrix::Init
|
||||
summary_array[i].Reserve(max_num_bins * kFactor);
|
||||
summary_array[i].SetPrune(out, max_num_bins * kFactor);
|
||||
}
|
||||
CHECK_EQ(summary_array.size(), in_sketchs->size());
|
||||
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
|
||||
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
|
||||
|
||||
this->min_val.resize(sketchs.size());
|
||||
row_ptr.push_back(0);
|
||||
for (size_t fid = 0; fid < summary_array.size(); ++fid) {
|
||||
@@ -479,14 +479,14 @@ void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
|
||||
#pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided)
|
||||
for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
|
||||
const size_t istart = iblock*block_size;
|
||||
const size_t iend = (((iblock+1)*block_size > size) ? size : istart + block_size);
|
||||
const size_t istart = iblock * block_size;
|
||||
const size_t iend = (((iblock + 1) * block_size > size) ? size : istart + block_size);
|
||||
|
||||
const size_t bin = 2*thread_init_[0]*nbins_;
|
||||
memcpy(hist_data + istart, (data + bin + istart), sizeof(double)*(iend - istart));
|
||||
const size_t bin = 2 * thread_init_[0] * nbins_;
|
||||
memcpy(hist_data + istart, (data + bin + istart), sizeof(double) * (iend - istart));
|
||||
|
||||
for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) {
|
||||
const size_t bin = 2*thread_init_[i_bin_part]*nbins_;
|
||||
const size_t bin = 2 * thread_init_[i_bin_part] * nbins_;
|
||||
for (size_t i = istart; i < iend; i++) {
|
||||
hist_data[i] += data[bin + i];
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "row_set.h"
|
||||
#include "../tree/param.h"
|
||||
#include "./quantile.h"
|
||||
#include "../include/rabit/rabit.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -43,6 +44,10 @@ struct GHistEntry {
|
||||
sum_hess += e.sum_hess;
|
||||
}
|
||||
|
||||
inline static void Reduce(GHistEntry& a, const GHistEntry& b) { // NOLINT(*)
|
||||
a.Add(b);
|
||||
}
|
||||
|
||||
/*! \brief set sum to be difference of two GHistEntry's */
|
||||
inline void SetSubtract(const GHistEntry& a, const GHistEntry& b) {
|
||||
sum_grad = a.sum_grad - b.sum_grad;
|
||||
@@ -166,7 +171,7 @@ class GHistIndexBlockMatrix {
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief histogram of graident statistics for a single node.
|
||||
* \brief histogram of gradient statistics for a single node.
|
||||
* Consists of multiple GHistEntry's, each entry showing total gradient statistics
|
||||
* for that particular bin
|
||||
* Uses global bin id so as to represent all features simultaneously
|
||||
@@ -254,6 +259,10 @@ class GHistBuilder {
|
||||
// construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
|
||||
|
||||
uint32_t GetNumBins() {
|
||||
return nbins_;
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief number of threads for parallel computation */
|
||||
size_t nthread_;
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
|
||||
#include "io.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
/*!
|
||||
|
||||
@@ -598,8 +598,8 @@ class LearnerImpl : public Learner {
|
||||
}
|
||||
|
||||
const TreeMethod current_tree_method = tparam_.tree_method;
|
||||
|
||||
if (rabit::IsDistributed()) {
|
||||
/* Choose tree_method='approx' (or keep 'hist' if it was requested) when distributed training is activated */
|
||||
CHECK(tparam_.dsplit != DataSplitMode::kAuto)
|
||||
<< "Precondition violated; dsplit cannot be 'auto' in distributed mode";
|
||||
if (tparam_.dsplit == DataSplitMode::kCol) {
|
||||
@@ -614,14 +614,13 @@ class LearnerImpl : public Learner {
|
||||
"for distributed training.";
|
||||
break;
|
||||
case TreeMethod::kApprox:
|
||||
case TreeMethod::kHist:
|
||||
// things are okay, do nothing
|
||||
break;
|
||||
case TreeMethod::kExact:
|
||||
case TreeMethod::kHist:
|
||||
LOG(WARNING) << "Tree method was set to be '"
|
||||
<< (current_tree_method == TreeMethod::kExact ?
|
||||
"exact" : "hist")
|
||||
<< "', but only 'approx' is available for distributed "
|
||||
LOG(CONSOLE) << "Tree method was set to be "
|
||||
<< "exact"
|
||||
<< "', but only 'approx' and 'hist' is available for distributed "
|
||||
"training. The `tree_method` parameter is now being "
|
||||
"changed to 'approx'";
|
||||
break;
|
||||
@@ -633,7 +632,15 @@ class LearnerImpl : public Learner {
|
||||
LOG(FATAL) << "Unknown tree_method ("
|
||||
<< static_cast<int>(current_tree_method) << ") detected";
|
||||
}
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
if (current_tree_method != TreeMethod::kHist) {
|
||||
LOG(CONSOLE) << "Tree method is automatically selected to be 'approx'"
|
||||
" for distributed training.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else {
|
||||
LOG(CONSOLE) << "Tree method is specified to be 'hist'"
|
||||
" for distributed training.";
|
||||
tparam_.tree_method = TreeMethod::kHist;
|
||||
}
|
||||
} else if (!p_train->SingleColBlock()) {
|
||||
/* Some tree methods are not available for external-memory DMatrix */
|
||||
switch (current_tree_method) {
|
||||
|
||||
@@ -126,6 +126,7 @@ class HistMaker: public BaseMaker {
|
||||
virtual void Update(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
RegTree *p_tree) {
|
||||
CHECK(param_.max_depth > 0) << "max_depth must be larger than 0";
|
||||
this->InitData(gpair, *p_fmat, *p_tree);
|
||||
this->InitWorkSet(p_fmat, *p_tree, &fwork_set_);
|
||||
// mark root node as fresh.
|
||||
@@ -345,10 +346,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
this->wspace_.Init(this->param_, 1);
|
||||
// if it is C++11, use lazy evaluation for Allreduce,
|
||||
// to gain speedup in recovery
|
||||
#if __cplusplus >= 201103L
|
||||
auto lazy_get_hist = [&]()
|
||||
#endif
|
||||
{
|
||||
auto lazy_get_hist = [&]() {
|
||||
thread_hist_.resize(omp_get_max_threads());
|
||||
// start accumulating statistics
|
||||
for (const auto &batch : p_fmat->GetSortedColumnBatches()) {
|
||||
@@ -371,22 +369,18 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const int nid = this->qexpand_[i];
|
||||
const int wid = this->node2workindex_[nid];
|
||||
this->wspace_.hset[0][fset.size() + wid * (fset.size()+1)]
|
||||
.data[0] = node_stats_[nid];
|
||||
this->wspace_.hset[0][fset.size() + wid * (fset.size() + 1)]
|
||||
.data[0] = node_stats_[nid];
|
||||
}
|
||||
};
|
||||
// sync the histogram
|
||||
// if it is C++11, use lazy evaluation for Allreduce
|
||||
#if __cplusplus >= 201103L
|
||||
this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
|
||||
this->wspace_.hset[0].data.size(), lazy_get_hist);
|
||||
#else
|
||||
this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
|
||||
this->wspace_.hset[0].data.size());
|
||||
#endif
|
||||
this->wspace_.hset[0].data.size(), lazy_get_hist);
|
||||
}
|
||||
|
||||
void ResetPositionAfterSplit(DMatrix *p_fmat,
|
||||
const RegTree &tree) override {
|
||||
const RegTree &tree) override {
|
||||
this->GetSplitSet(this->qexpand_, tree, &fsplit_set_);
|
||||
}
|
||||
void ResetPosAndPropose(const std::vector<GradientPair> &gpair,
|
||||
|
||||
@@ -156,12 +156,18 @@ void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat,
|
||||
const int cright = (*p_tree)[nid].RightChild();
|
||||
hist_.AddHistRow(cleft);
|
||||
hist_.AddHistRow(cright);
|
||||
if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
|
||||
if (rabit::IsDistributed()) {
|
||||
// in distributed mode, we need to keep consistent across workers
|
||||
BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
|
||||
SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
|
||||
} else {
|
||||
BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
|
||||
SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
|
||||
if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
|
||||
BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
|
||||
SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
|
||||
} else {
|
||||
BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
|
||||
SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
|
||||
}
|
||||
}
|
||||
time_build_hist += dmlc::GetTime() - tstart;
|
||||
|
||||
@@ -617,23 +623,34 @@ void QuantileHistMaker::Builder::InitNewNode(int nid,
|
||||
|
||||
{
|
||||
auto& stats = snode_[nid].stats;
|
||||
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
|
||||
/* specialized code for dense data
|
||||
For dense data (with no missing value),
|
||||
the sum of gradient histogram is equal to snode[nid] */
|
||||
GHistRow hist = hist_[nid];
|
||||
const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
|
||||
|
||||
const uint32_t ibegin = row_ptr[fid_least_bins_];
|
||||
const uint32_t iend = row_ptr[fid_least_bins_ + 1];
|
||||
for (uint32_t i = ibegin; i < iend; ++i) {
|
||||
const GHistEntry et = hist.begin[i];
|
||||
stats.Add(et.sum_grad, et.sum_hess);
|
||||
GHistRow hist = hist_[nid];
|
||||
if (rabit::IsDistributed()) {
|
||||
// in distributed mode, the node's stats should be calculated from histogram, otherwise,
|
||||
// we will have wrong results in EnumerateSplit()
|
||||
// here we take the first feature in cut (bins row_ptr[0] .. row_ptr[1])
|
||||
for (size_t i = gmat.cut.row_ptr[0]; i < gmat.cut.row_ptr[1]; i++) {
|
||||
stats.Add(hist.begin[i].sum_grad, hist.begin[i].sum_hess);
|
||||
}
|
||||
} else {
|
||||
const RowSetCollection::Elem e = row_set_collection_[nid];
|
||||
for (const size_t* it = e.begin; it < e.end; ++it) {
|
||||
stats.Add(gpair[*it]);
|
||||
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased ||
|
||||
rabit::IsDistributed()) {
|
||||
/* specialized code for dense data
|
||||
For dense data (with no missing value),
|
||||
the sum of gradient histogram is equal to snode[nid]
|
||||
GHistRow hist = hist_[nid];*/
|
||||
const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
|
||||
|
||||
const uint32_t ibegin = row_ptr[fid_least_bins_];
|
||||
const uint32_t iend = row_ptr[fid_least_bins_ + 1];
|
||||
for (uint32_t i = ibegin; i < iend; ++i) {
|
||||
const GHistEntry et = hist.begin[i];
|
||||
stats.Add(et.sum_grad, et.sum_hess);
|
||||
}
|
||||
} else {
|
||||
const RowSetCollection::Elem e = row_set_collection_[nid];
|
||||
for (const size_t* it = e.begin; it < e.end; ++it) {
|
||||
stats.Add(gpair[*it]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,6 +105,7 @@ class QuantileHistMaker: public TreeUpdater {
|
||||
} else {
|
||||
hist_builder_.BuildHist(gpair, row_indices, gmat, hist);
|
||||
}
|
||||
this->histred_.Allreduce(hist.begin, hist_builder_.GetNumBins());
|
||||
}
|
||||
|
||||
inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
|
||||
@@ -225,6 +226,8 @@ class QuantileHistMaker: public TreeUpdater {
|
||||
|
||||
enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
|
||||
DataLayout data_layout_;
|
||||
|
||||
rabit::Reducer<GHistEntry, GHistEntry::Reduce> histred_;
|
||||
};
|
||||
|
||||
std::unique_ptr<Builder> builder_;
|
||||
|
||||
@@ -52,10 +52,7 @@ class TreeRefresher: public TreeUpdater {
|
||||
}
|
||||
// if it is C++11, use lazy evaluation for Allreduce,
|
||||
// to gain speedup in recovery
|
||||
#if __cplusplus >= 201103L
|
||||
auto lazy_get_stats = [&]()
|
||||
#endif
|
||||
{
|
||||
auto lazy_get_stats = [&]() {
|
||||
const MetaInfo &info = p_fmat->Info();
|
||||
// start accumulating statistics
|
||||
for (const auto &batch : p_fmat->GetRowBatches()) {
|
||||
@@ -86,11 +83,7 @@ class TreeRefresher: public TreeUpdater {
|
||||
}
|
||||
}
|
||||
};
|
||||
#if __cplusplus >= 201103L
|
||||
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
|
||||
#else
|
||||
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size());
|
||||
#endif
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
|
||||
Reference in New Issue
Block a user