Improve multi-threaded performance (#2104)

* Add UpdatePredictionCache() option to updaters

Some updaters (e.g. fast_hist) have enough information to quickly compute
prediction cache for the training data. Each updater may override the
UpdatePredictionCache() method to update the prediction cache. Note: this
trick does not apply to validation data.

* Respond to code review

* Disable some debug messages by default
* Document UpdatePredictionCache() interface
* Remove base_margin logic from UpdatePredictionCache() implementation
* Do not take pointer to cfg, as reference may get stale

* Improve multi-threaded performance

* Use columnwise accessor to accelerate ApplySplit() step,
  with support for a compressed representation
* Parallel sort for evaluation step
* Inline BuildHist() function
* Cache gradient pairs when building histograms in BuildHist()

* Add missing #if macro

* Respond to code review

* Use wrapper to enable parallel sort on Linux

* Fix C++ compatibility issues

* MSVC doesn't support unsigned in OpenMP loops
* gcc 4.6 doesn't support using keyword

* Fix lint issues

* Respond to code review

* Fix bug in ApplySplitSparseData()

* Attempting to read beyond the end of a sparse column
* Mishandling the case where an entire range of rows have missing values

* Fix training continuation bug

Disable UpdatePredictionCache() in the first iteration. This way, we can
accommodate the scenario where we build off of an existing (nonempty) ensemble.

* Add regression test for fast_hist

* Respond to code review

* Add back old version of ApplySplitSparseData
This commit is contained in:
Philip Cho
2017-03-25 10:35:01 -07:00
committed by Tianqi Chen
parent 332aea26a3
commit 14fba01b5a
14 changed files with 719 additions and 171 deletions

View File

@@ -8,6 +8,7 @@
#include <vector>
#include "./sync.h"
#include "./hist_util.h"
#include "./column_matrix.h"
#include "./quantile.h"
namespace xgboost {
@@ -21,12 +22,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
const int kFactor = 8;
std::vector<WXQSketch> sketchs;
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
nthread = std::max(nthread / 2, 1);
const int nthread = omp_get_max_threads();
unsigned nstep = (info.num_col + nthread - 1) / nthread;
unsigned ncol = static_cast<unsigned>(info.num_col);
@@ -105,18 +101,14 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
}
}
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
CHECK(cut != nullptr);
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
hit_count.resize(cut->row_ptr.back(), 0);
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
nthread = std::max(nthread / 2, 1);
const int nthread = omp_get_max_threads();
const unsigned nbins = cut->row_ptr.back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(nthread * nbins, 0);
iter->BeforeFirst();
row_ptr.push_back(0);
@@ -134,6 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
omp_ulong bsize = static_cast<omp_ulong>(batch.size);
#pragma omp parallel for num_threads(nthread) schedule(static)
for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
const int tid = omp_get_thread_num();
size_t ibegin = row_ptr[rbegin + i];
size_t iend = row_ptr[rbegin + i + 1];
RowBatch::Inst inst = batch[i];
@@ -147,20 +140,28 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
if (it == cend) it = cend - 1;
unsigned idx = static_cast<unsigned>(it - cut->cut.begin());
index[ibegin + j] = idx;
++hit_count_tloc_[tid * nbins + idx];
}
std::sort(index.begin() + ibegin, index.begin() + iend);
}
#pragma omp parallel for num_threads(nthread) schedule(static)
for (omp_ulong idx = 0; idx < nbins; ++idx) {
for (int tid = 0; tid < nthread; ++tid) {
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
}
}
}
}
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const std::vector<bst_uint>& feat_set,
GHistRow hist) {
CHECK(!data_.empty()) << "GHistBuilder must be initialized";
CHECK_EQ(data_.size(), nbins_ * nthread_) << "invalid dimensions for temp buffer";
data_.resize(nbins_ * nthread_, GHistEntry());
std::fill(data_.begin(), data_.end(), GHistEntry());
stat_buf_.resize(row_indices.size());
const int K = 8; // loop unrolling factor
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
@@ -169,21 +170,42 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
const bst_omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbins_;
bst_uint rid[K];
bst_gpair stat[K];
size_t ibegin[K], iend[K];
for (int k = 0; k < K; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < K; ++k) {
stat_buf_[i + k] = stat[k];
}
}
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
const bst_uint rid = row_indices.begin[i];
const bst_gpair stat = gpair[rid];
stat_buf_[i] = stat;
}
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
const bst_omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbins_;
bst_uint rid[K];
size_t ibegin[K];
size_t iend[K];
bst_gpair stat[K];
for (int k = 0; k < K; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
}
for (int k = 0; k < K; ++k) {
stat[k] = stat_buf_[i + k];
}
for (int k = 0; k < K; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const size_t bin = gmat.index[j];
@@ -193,9 +215,9 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
}
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
const bst_uint rid = row_indices.begin[i];
const bst_gpair stat = gpair[rid];
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
const bst_gpair stat = stat_buf_[i];
for (size_t j = ibegin; j < iend; ++j) {
const size_t bin = gmat.index[j];
data_[bin].Add(stat);
@@ -212,13 +234,26 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
}
}
void GHistBuilder::SubtractionTrick(GHistRow self,
GHistRow sibling,
GHistRow parent) {
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
const int K = 8;
const bst_omp_uint rest = nbins % K;
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
GHistEntry pb[K];
GHistEntry sb[K];
for (int k = 0; k < K; ++k) {
pb[k] = parent.begin[bin_id + k];
}
for (int k = 0; k < K; ++k) {
sb[k] = sibling.begin[bin_id + k];
}
for (int k = 0; k < K; ++k) {
self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
}
}
for (bst_omp_uint bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
self.begin[bin_id].SetSubtract(parent.begin[bin_id], sibling.begin[bin_id]);
}
}