Improve multi-threaded performance (#2104)

* Add UpdatePredictionCache() option to updaters

Some updaters (e.g. fast_hist) have enough information to quickly compute
prediction cache for the training data. Each updater may override the
UpdatePredictionCache() method to update the prediction cache. Note: this
trick does not apply to validation data.

* Respond to code review

* Disable some debug messages by default
* Document UpdatePredictionCache() interface
* Remove base_margin logic from UpdatePredictionCache() implementation
* Do not take pointer to cfg, as reference may get stale

* Improve multi-threaded performance

* Use columnwise accessor to accelerate ApplySplit() step,
  with support for a compressed representation
* Parallel sort for evaluation step
* Inline BuildHist() function
* Cache gradient pairs when building histograms in BuildHist()

* Add missing #if macro

* Respond to code review

* Use wrapper to enable parallel sort on Linux

* Fix C++ compatibility issues

* MSVC doesn't support unsigned in OpenMP loops
* gcc 4.6 doesn't support using keyword

* Fix lint issues

* Respond to code review

* Fix bug in ApplySplitSparseData()

* Attempting to read beyond the end of a sparse column
* Mishandling the case where an entire range of rows have missing values

* Fix training continuation bug

Disable UpdatePredictionCache() in the first iteration. This way, we can
accommodate the scenario where we build off of an existing (nonempty) ensemble.

* Add regression test for fast_hist

* Respond to code review

* Add back old version of ApplySplitSparseData
This commit is contained in:
Philip Cho
2017-03-25 10:35:01 -07:00
committed by Tianqi Chen
parent 332aea26a3
commit 14fba01b5a
14 changed files with 719 additions and 171 deletions

View File

@@ -8,6 +8,7 @@
#include <vector>
#include "./sync.h"
#include "./hist_util.h"
#include "./column_matrix.h"
#include "./quantile.h"
namespace xgboost {
@@ -21,12 +22,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
const int kFactor = 8;
std::vector<WXQSketch> sketchs;
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
nthread = std::max(nthread / 2, 1);
const int nthread = omp_get_max_threads();
unsigned nstep = (info.num_col + nthread - 1) / nthread;
unsigned ncol = static_cast<unsigned>(info.num_col);
@@ -105,18 +101,14 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
}
}
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
CHECK(cut != nullptr);
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
hit_count.resize(cut->row_ptr.back(), 0);
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
nthread = std::max(nthread / 2, 1);
const int nthread = omp_get_max_threads();
const unsigned nbins = cut->row_ptr.back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(nthread * nbins, 0);
iter->BeforeFirst();
row_ptr.push_back(0);
@@ -134,6 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
omp_ulong bsize = static_cast<omp_ulong>(batch.size);
#pragma omp parallel for num_threads(nthread) schedule(static)
for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
const int tid = omp_get_thread_num();
size_t ibegin = row_ptr[rbegin + i];
size_t iend = row_ptr[rbegin + i + 1];
RowBatch::Inst inst = batch[i];
@@ -147,20 +140,28 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
if (it == cend) it = cend - 1;
unsigned idx = static_cast<unsigned>(it - cut->cut.begin());
index[ibegin + j] = idx;
++hit_count_tloc_[tid * nbins + idx];
}
std::sort(index.begin() + ibegin, index.begin() + iend);
}
#pragma omp parallel for num_threads(nthread) schedule(static)
for (omp_ulong idx = 0; idx < nbins; ++idx) {
for (int tid = 0; tid < nthread; ++tid) {
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
}
}
}
}
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const std::vector<bst_uint>& feat_set,
GHistRow hist) {
CHECK(!data_.empty()) << "GHistBuilder must be initialized";
CHECK_EQ(data_.size(), nbins_ * nthread_) << "invalid dimensions for temp buffer";
data_.resize(nbins_ * nthread_, GHistEntry());
std::fill(data_.begin(), data_.end(), GHistEntry());
stat_buf_.resize(row_indices.size());
const int K = 8; // loop unrolling factor
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
@@ -169,21 +170,42 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
const bst_omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbins_;
bst_uint rid[K];
bst_gpair stat[K];
size_t ibegin[K], iend[K];
for (int k = 0; k < K; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < K; ++k) {
stat_buf_[i + k] = stat[k];
}
}
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
const bst_uint rid = row_indices.begin[i];
const bst_gpair stat = gpair[rid];
stat_buf_[i] = stat;
}
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
const bst_omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbins_;
bst_uint rid[K];
size_t ibegin[K];
size_t iend[K];
bst_gpair stat[K];
for (int k = 0; k < K; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
}
for (int k = 0; k < K; ++k) {
stat[k] = stat_buf_[i + k];
}
for (int k = 0; k < K; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const size_t bin = gmat.index[j];
@@ -193,9 +215,9 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
}
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
const bst_uint rid = row_indices.begin[i];
const bst_gpair stat = gpair[rid];
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
const bst_gpair stat = stat_buf_[i];
for (size_t j = ibegin; j < iend; ++j) {
const size_t bin = gmat.index[j];
data_[bin].Add(stat);
@@ -212,13 +234,26 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
}
}
void GHistBuilder::SubtractionTrick(GHistRow self,
GHistRow sibling,
GHistRow parent) {
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
const int K = 8;
const bst_omp_uint rest = nbins % K;
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
GHistEntry pb[K];
GHistEntry sb[K];
for (int k = 0; k < K; ++k) {
pb[k] = parent.begin[bin_id + k];
}
for (int k = 0; k < K; ++k) {
sb[k] = sibling.begin[bin_id + k];
}
for (int k = 0; k < K; ++k) {
self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
}
}
for (bst_omp_uint bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
self.begin[bin_id].SetSubtract(parent.begin[bin_id], sibling.begin[bin_id]);
}
}