Improve multi-threaded performance (#2104)

* Add UpdatePredictionCache() option to updaters. Some updaters (e.g. fast_hist) have enough information to quickly compute prediction cache for the training data. Each updater may override the UpdatePredictionCache() method to update the prediction cache. Note: this trick does not apply to validation data. * Respond to code review * Disable some debug messages by default * Document UpdatePredictionCache() interface * Remove base_margin logic from UpdatePredictionCache() implementation * Do not take pointer to cfg, as reference may get stale * Improve multi-threaded performance * Use columnwise accessor to accelerate ApplySplit() step, with support for a compressed representation * Parallel sort for evaluation step * Inline BuildHist() function * Cache gradient pairs when building histograms in BuildHist() * Add missing #if macro * Respond to code review * Use wrapper to enable parallel sort on Linux * Fix C++ compatibility issues * MSVC doesn't support unsigned in OpenMP loops * gcc 4.6 doesn't support using keyword * Fix lint issues * Respond to code review * Fix bug in ApplySplitSparseData() * Attempting to read beyond the end of a sparse column * Mishandling the case where an entire range of rows has missing values * Fix training continuation bug. Disable UpdatePredictionCache() in the first iteration. This way, we can accommodate the scenario where we build off of an existing (nonempty) ensemble. * Add regression test for fast_hist * Respond to code review * Add back old version of ApplySplitSparseData
2017-03-25 10:35:01 -07:00
parent 332aea26a3
commit 14fba01b5a
14 changed files with 719 additions and 171 deletions
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -0,0 +1,231 @@
+/*!
+ * Copyright 2017 by Contributors
+ * \file column_matrix.h
+ * \brief Utility for fast column-wise access
+ * \author Philip Cho
+ */
+
+#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
+#define XGBOOST_COMMON_COLUMN_MATRIX_H_
+
+/*!
+ * \brief Dispatch on a run-time xgboost::common::DataType flag.
+ *
+ * Expands to a switch statement. For each supported flag it defines the
+ * local typedef DType (uint32_t, uint16_t or uint8_t) and then executes OP
+ * inside a scope of its own. Any other flag value is reported through
+ * LOG(FATAL).
+ *
+ * The default label lives at switch level, outside every case scope, so
+ * reaching it never jumps over a declaration that OP introduces.
+ *
+ * \param dtype expression yielding the type flag
+ * \param OP    statement(s) to execute with DType in scope
+ */
+#define XGBOOST_TYPE_SWITCH(dtype, OP)                              \
+switch (dtype) {                                                    \
+  case xgboost::common::uint32 : {                                  \
+    typedef uint32_t DType;                                         \
+    OP; break;                                                      \
+  }                                                                 \
+  case xgboost::common::uint16 : {                                  \
+    typedef uint16_t DType;                                         \
+    OP; break;                                                      \
+  }                                                                 \
+  case xgboost::common::uint8 : {                                   \
+    typedef uint8_t DType;                                          \
+    OP; break;                                                      \
+  }                                                                 \
+  default:                                                          \
+    LOG(FATAL) << "don't recognize type flag" << (dtype);           \
+}
+
+#include <type_traits>
+#include <limits>
+#include <vector>
+#include "hist_util.h"
+
+namespace xgboost {
+namespace common {
+
+/*! \brief indicator of data type used for storing bin id's in a column.
+    The numeric value of each flag equals the size in bytes of the matching
+    integer type; ColumnMatrix::Init() depends on this when it computes
+    sizeof(uint32_t) / dtype as its packing factor. */
+enum DataType {
+  uint8 = 1,
+  uint16 = 2,
+  uint32 = 4
+};
+
+/*! \brief column type. ColumnMatrix::Init() marks a feature as sparse when
+    its number of recorded entries is below half the number of rows, and as
+    dense otherwise. */
+enum ColumnType {
+  kDenseColumn,  // one slot per row; missing values hold the max value of the bin id type
+  kSparseColumn  // only present entries are stored, together with their row indices
+};
+
+/*! \brief a column storage, to be used with ApplySplit. Note that each
+    bin id is stored relative to the least bin id of its feature, so the
+    bin id of entry i is recovered as index[i] + index_base.
+    Instances are filled in by ColumnMatrix::GetColumn(); index and row_ind
+    point into buffers held by that ColumnMatrix. */
+template<typename T>
+class Column {
+ public:
+  ColumnType type;  // dense or sparse layout
+  const T* index;  // first stored bin id of this column
+  uint32_t index_base;  // least bin id of the feature
+  const uint32_t* row_ind;  // row index of each stored entry; only filled for sparse columns
+  size_t len;  // number of slots reachable through index
+};
+
+/*! \brief a collection of columns, with support for construction from
+    GHistIndexMatrix.
+
+    The bin ids of all features share one buffer (index_). Each bin id is
+    stored relative to the least bin id of its feature, using the integer
+    width selected by dtype. A dense column reserves one slot per row and
+    marks missing values with the maximum value of that integer type; a
+    sparse column keeps only the entries that are present and records their
+    row indices in row_ind_. */
+class ColumnMatrix {
+ public:
+  // get number of features
+  inline uint32_t GetNumFeature() const {
+    return static_cast<uint32_t>(type_.size());
+  }
+
+  /*!
+   * \brief construct column matrix from GHistIndexMatrix
+   * \param gmat  quantized matrix to convert; its cut pointer must be set and
+   *              the bin ids of every row must be in ascending order
+   * \param dtype integer type used to store the per-feature bin ids
+   */
+  inline void Init(const GHistIndexMatrix& gmat, DataType dtype) {
+    this->dtype = dtype;
+    /* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
+       slot of internal buffer. */
+    packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
+
+    // row_ptr holds nfeature + 1 entries; an empty vector would underflow below
+    CHECK(!gmat.cut->row_ptr.empty());
+    const uint32_t nfeature = static_cast<uint32_t>(gmat.cut->row_ptr.size() - 1);
+    const omp_ulong nrow = static_cast<omp_ulong>(gmat.row_ptr.size() - 1);
+
+    // the number of bins of every feature must fit into the chosen type;
+    // the largest value itself stays free to act as the missing marker
+    uint32_t max_val = 0;
+    XGBOOST_TYPE_SWITCH(this->dtype, {
+      max_val = static_cast<uint32_t>(std::numeric_limits<DType>::max());
+    });
+    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+      CHECK_LE(gmat.cut->row_ptr[fid + 1] - gmat.cut->row_ptr[fid], max_val);
+    }
+
+    // identify type of each column
+    // (GetFeatureCounts accumulates, so the counters are zeroed first)
+    feature_counts_.assign(nfeature, 0);
+    type_.resize(nfeature);
+    gmat.GetFeatureCounts(feature_counts_.data());
+    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+      if (static_cast<double>(feature_counts_[fid]) < 0.5*nrow) {
+        type_[fid] = kSparseColumn;
+      } else {
+        type_[fid] = kDenseColumn;
+      }
+    }
+
+    // want to compute storage boundary for each feature
+    // using variants of prefix sum scan
+    // (size_t accumulators: nrow * nfeature may exceed the 32-bit range)
+    boundary_.resize(nfeature);
+    size_t accum_index = 0;
+    size_t accum_row_ind = 0;
+    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+      boundary_[fid].index_begin = accum_index;
+      boundary_[fid].row_ind_begin = accum_row_ind;
+      if (type_[fid] == kDenseColumn) {
+        accum_index += nrow;
+      } else {
+        accum_index += feature_counts_[fid];
+        accum_row_ind += feature_counts_[fid];
+      }
+      boundary_[fid].index_end = accum_index;
+      boundary_[fid].row_ind_end = accum_row_ind;
+    }
+
+    // the running totals are the overall sizes; using them instead of
+    // boundary_[nfeature - 1] keeps the zero-feature case in bounds
+    index_.resize((accum_index + (packing_factor_ - 1)) / packing_factor_);
+    row_ind_.resize(accum_row_ind);
+
+    // store least bin id for each feature
+    index_base_.resize(nfeature);
+    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+      index_base_[fid] = gmat.cut->row_ptr[fid];
+    }
+
+    // fill index_ for dense columns
+    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+      if (type_[fid] != kDenseColumn) {
+        continue;
+      }
+      XGBOOST_TYPE_SWITCH(this->dtype, {
+        DType* begin = ColumnBegin<DType>(fid);
+        // max() indicates missing values
+        std::fill(begin, begin + nrow, std::numeric_limits<DType>::max());
+      });
+    }
+
+    // loop over all rows and fill column entries
+    // num_nonzeros[fid] = how many nonzeros have this feature accumulated so far?
+    std::vector<uint32_t> num_nonzeros(nfeature, 0);
+    for (uint32_t rid = 0; rid < nrow; ++rid) {
+      const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
+      const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+      size_t fid = 0;
+      for (size_t i = ibegin; i < iend; ++i) {
+        const size_t bin_id = gmat.index[i];
+        // bin ids within a row ascend, so fid only ever moves forward
+        while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
+          ++fid;
+        }
+        const bool is_dense = (type_[fid] == kDenseColumn);
+        // dense columns are addressed by row id, sparse ones by insertion order
+        const size_t pos = is_dense ? rid : num_nonzeros[fid];
+        XGBOOST_TYPE_SWITCH(this->dtype, {
+          ColumnBegin<DType>(fid)[pos] = static_cast<DType>(bin_id - index_base_[fid]);
+        });
+        if (!is_dense) {
+          row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
+          ++num_nonzeros[fid];
+        }
+      }
+    }
+  }
+
+  /* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
+     to determine type of bin id's */
+  template<typename T>
+  inline Column<T> GetColumn(unsigned fid) const {
+    static_assert(std::is_same<T, uint32_t>::value
+                  || std::is_same<T, uint16_t>::value
+                  || std::is_same<T, uint8_t>::value,
+                  "T must be uint32_t, uint16_t or uint8_t");
+
+    Column<T> c;
+
+    c.type = type_[fid];
+    c.index = ColumnBegin<T>(fid);
+    c.index_base = index_base_[fid];
+    // data() + offset stays valid even when row_ind_ is empty (all columns dense)
+    c.row_ind = row_ind_.data() + boundary_[fid].row_ind_begin;
+    c.len = boundary_[fid].index_end - boundary_[fid].index_begin;
+
+    return c;
+  }
+
+ public:
+  DataType dtype;
+
+ private:
+  struct ColumnBoundary {
+    // indicate where each column's index and row_ind is stored.
+    // index_begin and index_end are logical offsets, so they should be converted to
+    // actual offsets by scaling with packing_factor_
+    size_t index_begin;
+    size_t index_end;
+    size_t row_ind_begin;
+    size_t row_ind_end;
+  };
+
+  // address of the first bin id of feature fid inside index_, viewed as T
+  template<typename T>
+  inline const T* ColumnBegin(size_t fid) const {
+    const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
+    const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
+    return reinterpret_cast<const T*>(index_.data() + block_offset) + elem_offset;
+  }
+
+  // writable counterpart of the accessor above
+  template<typename T>
+  inline T* ColumnBegin(size_t fid) {
+    const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
+    const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
+    return reinterpret_cast<T*>(index_.data() + block_offset) + elem_offset;
+  }
+
+  std::vector<bst_uint> feature_counts_;  // number of recorded entries per feature
+  std::vector<ColumnType> type_;  // dense / sparse flag per feature
+  std::vector<uint32_t> index_;  // index_: may store smaller integers; needs padding
+  std::vector<uint32_t> row_ind_;  // row indices of the entries of sparse columns
+  std::vector<ColumnBoundary> boundary_;  // storage range of each feature
+
+  size_t packing_factor_;  // how many integers are stored in each slot of index_
+
+  // index_base_[fid]: least bin id for feature fid
+  std::vector<uint32_t> index_base_;
+};
+
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_COLUMN_MATRIX_H_
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -8,6 +8,7 @@
 #include <vector>
 #include "./sync.h"
 #include "./hist_util.h"
+#include "./column_matrix.h"
 #include "./quantile.h"

 namespace xgboost {
@@ -21,12 +22,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
  const int kFactor = 8;
  std::vector<WXQSketch> sketchs;

-  int nthread;
-  #pragma omp parallel
-  {
-    nthread = omp_get_num_threads();
-  }
-  nthread = std::max(nthread / 2, 1);
+  const int nthread = omp_get_max_threads();

  unsigned nstep = (info.num_col + nthread - 1) / nthread;
  unsigned ncol = static_cast<unsigned>(info.num_col);
@@ -105,18 +101,14 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
  }
 }

-
 void GHistIndexMatrix::Init(DMatrix* p_fmat) {
  CHECK(cut != nullptr);
  dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
-  hit_count.resize(cut->row_ptr.back(), 0);

-  int nthread;
-  #pragma omp parallel
-  {
-    nthread = omp_get_num_threads();
-  }
-  nthread = std::max(nthread / 2, 1);
+  const int nthread = omp_get_max_threads();
+  const unsigned nbins = cut->row_ptr.back();
+  hit_count.resize(nbins, 0);
+  hit_count_tloc_.resize(nthread * nbins, 0);

  iter->BeforeFirst();
  row_ptr.push_back(0);
@@ -134,6 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
    omp_ulong bsize = static_cast<omp_ulong>(batch.size);
    #pragma omp parallel for num_threads(nthread) schedule(static)
    for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
+      const int tid = omp_get_thread_num();
      size_t ibegin = row_ptr[rbegin + i];
      size_t iend = row_ptr[rbegin + i + 1];
      RowBatch::Inst inst = batch[i];
@@ -147,20 +140,28 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
        if (it == cend) it = cend - 1;
        unsigned idx = static_cast<unsigned>(it - cut->cut.begin());
        index[ibegin + j] = idx;
+        ++hit_count_tloc_[tid * nbins + idx];
      }
      std::sort(index.begin() + ibegin, index.begin() + iend);
    }
+
+    #pragma omp parallel for num_threads(nthread) schedule(static)
+    for (omp_ulong idx = 0; idx < nbins; ++idx) {
+      for (int tid = 0; tid < nthread; ++tid) {
+        hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
+      }
+    }
  }
 }

 void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
                             const RowSetCollection::Elem row_indices,
                             const GHistIndexMatrix& gmat,
+                             const std::vector<bst_uint>& feat_set,
                             GHistRow hist) {
-  CHECK(!data_.empty()) << "GHistBuilder must be initialized";
-  CHECK_EQ(data_.size(), nbins_ * nthread_) << "invalid dimensions for temp buffer";
-
+  data_.resize(nbins_ * nthread_, GHistEntry());
  std::fill(data_.begin(), data_.end(), GHistEntry());
+  stat_buf_.resize(row_indices.size());

  const int K = 8;  // loop unrolling factor
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
@@ -169,21 +170,42 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,

  #pragma omp parallel for num_threads(nthread) schedule(static)
  for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
-    const bst_omp_uint tid = omp_get_thread_num();
-    const size_t off = tid * nbins_;
    bst_uint rid[K];
    bst_gpair stat[K];
-    size_t ibegin[K], iend[K];
    for (int k = 0; k < K; ++k) {
      rid[k] = row_indices.begin[i + k];
    }
    for (int k = 0; k < K; ++k) {
      stat[k] = gpair[rid[k]];
    }
+    for (int k = 0; k < K; ++k) {
+      stat_buf_[i + k] = stat[k];
+    }
+  }
+  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
+    const bst_uint rid = row_indices.begin[i];
+    const bst_gpair stat = gpair[rid];
+    stat_buf_[i] = stat;
+  }
+
+  #pragma omp parallel for num_threads(nthread) schedule(dynamic)
+  for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
+    const bst_omp_uint tid = omp_get_thread_num();
+    const size_t off = tid * nbins_;
+    bst_uint rid[K];
+    size_t ibegin[K];
+    size_t iend[K];
+    bst_gpair stat[K];
+    for (int k = 0; k < K; ++k) {
+      rid[k] = row_indices.begin[i + k];
+    }
    for (int k = 0; k < K; ++k) {
      ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
      iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
    }
+    for (int k = 0; k < K; ++k) {
+      stat[k] = stat_buf_[i + k];
+    }
    for (int k = 0; k < K; ++k) {
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
        const size_t bin = gmat.index[j];
@@ -193,9 +215,9 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
  }
  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
    const bst_uint rid = row_indices.begin[i];
-    const bst_gpair stat = gpair[rid];
    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+    const bst_gpair stat = stat_buf_[i];
    for (size_t j = ibegin; j < iend; ++j) {
      const size_t bin = gmat.index[j];
      data_[bin].Add(stat);
@@ -212,13 +234,26 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
  }
 }

-void GHistBuilder::SubtractionTrick(GHistRow self,
-                                    GHistRow sibling,
-                                    GHistRow parent) {
+void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
  const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
+  const int K = 8;
+  const bst_omp_uint rest = nbins % K;
  #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
+  for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
+    GHistEntry pb[K];
+    GHistEntry sb[K];
+    for (int k = 0; k < K; ++k) {
+      pb[k] = parent.begin[bin_id + k];
+    }
+    for (int k = 0; k < K; ++k) {
+      sb[k] = sibling.begin[bin_id + k];
+    }
+    for (int k = 0; k < K; ++k) {
+      self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
+    }
+  }
+  for (bst_omp_uint bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
    self.begin[bin_id].SetSubtract(parent.begin[bin_id], sibling.begin[bin_id]);
  }
 }
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -102,18 +102,27 @@ struct GHistIndexMatrix {
  std::vector<unsigned> index;
  /*! \brief hit count of each index */
  std::vector<unsigned> hit_count;
-  /*! \brief optional remap index from outter row_id -> internal row_id*/
-  std::vector<unsigned> remap_index;
  /*! \brief The corresponding cuts */
  const HistCutMatrix* cut;
  // Create a global histogram matrix, given cut
  void Init(DMatrix* p_fmat);
-  // build remap
-  void Remap();
  // get i-th row
  inline GHistIndexRow operator[](bst_uint i) const {
    return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
  }
+  /*!
+   * \brief add up, for every feature, the hit counts of all of its bins
+   * \param counts array with one slot per feature; each total is added onto
+   *               the value already stored there, nothing is zeroed here
+   */
+  inline void GetFeatureCounts(bst_uint* counts) const {
+    const unsigned num_features = cut->row_ptr.size() - 1;
+    for (unsigned f = 0; f != num_features; ++f) {
+      // bins of feature f occupy [row_ptr[f], row_ptr[f + 1])
+      bst_uint total = 0;
+      for (unsigned bin = cut->row_ptr[f], last = cut->row_ptr[f + 1]; bin < last; ++bin) {
+        total += hit_count[bin];
+      }
+      counts[f] += total;
+    }
+  }
+
+ private:
+  std::vector<unsigned> hit_count_tloc_;
 };

 /*!
@@ -189,13 +198,13 @@ class GHistBuilder {
  inline void Init(size_t nthread, size_t nbins) {
    nthread_ = nthread;
    nbins_ = nbins;
-    data_.resize(nthread * nbins, GHistEntry());
  }

  // construct a histogram via histogram aggregation
  void BuildHist(const std::vector<bst_gpair>& gpair,
                 const RowSetCollection::Elem row_indices,
                 const GHistIndexMatrix& gmat,
+                 const std::vector<bst_uint>& feat_set,
                 GHistRow hist);
  // construct a histogram via subtraction trick
  void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
@@ -206,6 +215,7 @@ class GHistBuilder {
  /*! \brief number of all bins over all features */
  size_t nbins_;
  std::vector<GHistEntry> data_;
+  std::vector<bst_gpair> stat_buf_;
 };


--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -17,15 +17,20 @@ namespace common {
 /*! \brief collection of rowset */
 class RowSetCollection {
 public:
-  /*! \brief subset of rows */
+  /*! \brief data structure to store an instance set, a subset of
+   *  rows (instances) associated with a particular node in a decision
+   *  tree. */
  struct Elem {
    const bst_uint* begin;
    const bst_uint* end;
+    int node_id;
+      // id of node associated with this instance set; -1 means uninitialized
    Elem(void)
-        : begin(nullptr), end(nullptr) {}
+        : begin(nullptr), end(nullptr), node_id(-1) {}
    Elem(const bst_uint* begin,
-         const bst_uint* end)
-        : begin(begin), end(end) {}
+         const bst_uint* end,
+         int node_id)
+        : begin(begin), end(end), node_id(node_id) {}

    inline size_t size() const {
      return end - begin;
@@ -36,6 +41,15 @@ class RowSetCollection {
    std::vector<bst_uint> left;
    std::vector<bst_uint> right;
  };
+
+  /*! \brief read-only iterator to the first per-node instance set */
+  std::vector<Elem>::const_iterator begin() const {
+    return elem_of_each_node_.cbegin();
+  }
+
+  /*! \brief read-only iterator one past the last per-node instance set */
+  std::vector<Elem>::const_iterator end() const {
+    return elem_of_each_node_.cend();
+  }
+
  /*! \brief return corresponding element set given the node_id */
  inline const Elem& operator[](unsigned node_id) const {
    const Elem& e = elem_of_each_node_[node_id];
@@ -53,7 +67,7 @@ class RowSetCollection {
    CHECK_EQ(elem_of_each_node_.size(), 0U);
    const bst_uint* begin = dmlc::BeginPtr(row_indices_);
    const bst_uint* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
-    elem_of_each_node_.emplace_back(Elem(begin, end));
+    elem_of_each_node_.emplace_back(Elem(begin, end, 0));
  }
  // split rowset into two
  inline void AddSplit(unsigned node_id,
@@ -79,15 +93,15 @@ class RowSetCollection {
    }

    if (left_node_id >= elem_of_each_node_.size()) {
-      elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr));
+      elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
    }
    if (right_node_id >= elem_of_each_node_.size()) {
-      elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr));
+      elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
    }

-    elem_of_each_node_[left_node_id] = Elem(begin, split_pt);
-    elem_of_each_node_[right_node_id] = Elem(split_pt, e.end);
-    elem_of_each_node_[node_id] = Elem(nullptr, nullptr);
+    elem_of_each_node_[left_node_id] = Elem(begin, split_pt, left_node_id);
+    elem_of_each_node_[right_node_id] = Elem(split_pt, e.end, right_node_id);
+    elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
  }

  // stores the row indices in the set