Improve multi-threaded performance (#2104)
* Add UpdatePredictionCache() option to updaters. Some updaters (e.g. fast_hist) have enough information to quickly compute the prediction cache for the training data. Each updater may override the UpdatePredictionCache() method to update the prediction cache. Note: this trick does not apply to validation data. * Respond to code review * Disable some debug messages by default * Document UpdatePredictionCache() interface * Remove base_margin logic from UpdatePredictionCache() implementation * Do not take pointer to cfg, as reference may get stale * Improve multi-threaded performance * Use columnwise accessor to accelerate ApplySplit() step, with support for a compressed representation * Parallel sort for evaluation step * Inline BuildHist() function * Cache gradient pairs when building histograms in BuildHist() * Add missing #if macro * Respond to code review * Use wrapper to enable parallel sort on Linux * Fix C++ compatibility issues * MSVC doesn't support unsigned in OpenMP loops * gcc 4.6 doesn't support using keyword * Fix lint issues * Respond to code review * Fix bug in ApplySplitSparseData() * Attempting to read beyond the end of a sparse column * Mishandling the case where an entire range of rows have missing values * Fix training continuation bug: Disable UpdatePredictionCache() in the first iteration. This way, we can accommodate the scenario where we build off of an existing (nonempty) ensemble. * Add regression test for fast_hist * Respond to code review * Add back old version of ApplySplitSparseData
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
#include <vector>
|
||||
#include "./sync.h"
|
||||
#include "./hist_util.h"
|
||||
#include "./column_matrix.h"
|
||||
#include "./quantile.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -21,12 +22,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
|
||||
const int kFactor = 8;
|
||||
std::vector<WXQSketch> sketchs;
|
||||
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
nthread = std::max(nthread / 2, 1);
|
||||
const int nthread = omp_get_max_threads();
|
||||
|
||||
unsigned nstep = (info.num_col + nthread - 1) / nthread;
|
||||
unsigned ncol = static_cast<unsigned>(info.num_col);
|
||||
@@ -105,18 +101,14 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
CHECK(cut != nullptr);
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
hit_count.resize(cut->row_ptr.back(), 0);
|
||||
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
nthread = std::max(nthread / 2, 1);
|
||||
const int nthread = omp_get_max_threads();
|
||||
const unsigned nbins = cut->row_ptr.back();
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(nthread * nbins, 0);
|
||||
|
||||
iter->BeforeFirst();
|
||||
row_ptr.push_back(0);
|
||||
@@ -134,6 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
omp_ulong bsize = static_cast<omp_ulong>(batch.size);
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
|
||||
const int tid = omp_get_thread_num();
|
||||
size_t ibegin = row_ptr[rbegin + i];
|
||||
size_t iend = row_ptr[rbegin + i + 1];
|
||||
RowBatch::Inst inst = batch[i];
|
||||
@@ -147,20 +140,28 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
if (it == cend) it = cend - 1;
|
||||
unsigned idx = static_cast<unsigned>(it - cut->cut.begin());
|
||||
index[ibegin + j] = idx;
|
||||
++hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
std::sort(index.begin() + ibegin, index.begin() + iend);
|
||||
}
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (omp_ulong idx = 0; idx < nbins; ++idx) {
|
||||
for (int tid = 0; tid < nthread; ++tid) {
|
||||
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
CHECK(!data_.empty()) << "GHistBuilder must be initialized";
|
||||
CHECK_EQ(data_.size(), nbins_ * nthread_) << "invalid dimensions for temp buffer";
|
||||
|
||||
data_.resize(nbins_ * nthread_, GHistEntry());
|
||||
std::fill(data_.begin(), data_.end(), GHistEntry());
|
||||
stat_buf_.resize(row_indices.size());
|
||||
|
||||
const int K = 8; // loop unrolling factor
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
@@ -169,21 +170,42 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
const bst_omp_uint tid = omp_get_thread_num();
|
||||
const size_t off = tid * nbins_;
|
||||
bst_uint rid[K];
|
||||
bst_gpair stat[K];
|
||||
size_t ibegin[K], iend[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
stat_buf_[i + k] = stat[k];
|
||||
}
|
||||
}
|
||||
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
|
||||
const bst_uint rid = row_indices.begin[i];
|
||||
const bst_gpair stat = gpair[rid];
|
||||
stat_buf_[i] = stat;
|
||||
}
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
const bst_omp_uint tid = omp_get_thread_num();
|
||||
const size_t off = tid * nbins_;
|
||||
bst_uint rid[K];
|
||||
size_t ibegin[K];
|
||||
size_t iend[K];
|
||||
bst_gpair stat[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
|
||||
iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
stat[k] = stat_buf_[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const size_t bin = gmat.index[j];
|
||||
@@ -193,9 +215,9 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
|
||||
const bst_uint rid = row_indices.begin[i];
|
||||
const bst_gpair stat = gpair[rid];
|
||||
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
|
||||
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
|
||||
const bst_gpair stat = stat_buf_[i];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const size_t bin = gmat.index[j];
|
||||
data_[bin].Add(stat);
|
||||
@@ -212,13 +234,26 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
}
|
||||
|
||||
void GHistBuilder::SubtractionTrick(GHistRow self,
|
||||
GHistRow sibling,
|
||||
GHistRow parent) {
|
||||
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
|
||||
const int K = 8;
|
||||
const bst_omp_uint rest = nbins % K;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
|
||||
for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
|
||||
GHistEntry pb[K];
|
||||
GHistEntry sb[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
pb[k] = parent.begin[bin_id + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
sb[k] = sibling.begin[bin_id + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
|
||||
}
|
||||
}
|
||||
for (bst_omp_uint bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
|
||||
self.begin[bin_id].SetSubtract(parent.begin[bin_id], sibling.begin[bin_id]);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user