Patch to improve multithreaded performance scaling (#2493)

* Patch to improve multithreaded performance scaling

Change parallel strategy for histogram construction.
Instead of partitioning data rows among multiple threads, partition feature
columns instead. Useful heuristics for assigning partitions have been adopted
from LightGBM project.

* Add missing header to satisfy MSVC

* Restore max_bin and related parameters to TrainParam

* Fix lint error

* inline functions do not require static keyword

* Feature grouping algorithm accepting FastHistParam

Feature grouping algorithm accepts many parameters (3+), and it gets annoying to
pass them one by one. Instead, simply pass the reference to FastHistParam. The
definition of FastHistParam has been moved to a separate header file to
accommodate this change.
This commit is contained in:
Philip Cho
2017-07-07 08:25:07 -07:00
committed by Tianqi Chen
parent 6bfc472bec
commit ba820847f9
6 changed files with 466 additions and 52 deletions

View File

@@ -5,10 +5,12 @@
* \author Philip Cho, Tianqi Chen
*/
#include <dmlc/omp.h>
#include <numeric>
#include <vector>
#include "./sync.h"
#include "./hist_util.h"
#include "./random.h"
#include "./column_matrix.h"
#include "./hist_util.h"
#include "./quantile.h"
namespace xgboost {
@@ -154,6 +156,246 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
}
}
// Count how many nonmissing entries of `column` fall on rows already set in
// `mark`. Returns early with max_cnt + 1 as soon as the count exceeds
// max_cnt, so callers can cheaply test "would adding this column blow the
// conflict budget?" without scanning the whole column.
template <typename T>
static unsigned GetConflictCount(const std::vector<bool>& mark,
                                 const Column<T>& column,
                                 unsigned max_cnt) {
  unsigned nconflict = 0;
  const bool is_dense = (column.type == xgboost::common::kDenseColumn);
  for (size_t i = 0; i < column.len; ++i) {
    // Dense columns store a sentinel (numeric_limits<T>::max()) for missing
    // entries and entry i belongs to row i; sparse columns list explicit row
    // indices in row_ind.
    const bool collides = is_dense
        ? (column.index[i] != std::numeric_limits<T>::max() && mark[i])
        : mark[column.row_ind[i]];
    if (collides) {
      ++nconflict;
      if (nconflict > max_cnt) {
        return max_cnt + 1;  // early exit: budget already exceeded
      }
    }
  }
  return nconflict;
}
// Record in *p_mark every row on which `column` has a nonmissing entry, so
// later GetConflictCount calls see those rows as occupied.
template <typename T>
inline void
MarkUsed(std::vector<bool>* p_mark, const Column<T>& column) {
  std::vector<bool>& mark = *p_mark;
  const size_t len = column.len;
  if (column.type != xgboost::common::kDenseColumn) {
    // Sparse column: occupied rows are listed explicitly in row_ind.
    for (size_t i = 0; i < len; ++i) {
      mark[column.row_ind[i]] = true;
    }
  } else {
    // Dense column: entry i belongs to row i; skip missing (sentinel) values.
    for (size_t i = 0; i < len; ++i) {
      if (column.index[i] != std::numeric_limits<T>::max()) {
        mark[i] = true;
      }
    }
  }
}
// Greedily bundle features that rarely overlap, i.e. only a few rows should
// have nonzero values for more than one member of a group. One-hot encoded
// features (mutually exclusive by construction) naturally end up together.
// Heuristics adapted from the LightGBM project.
template <typename T>
inline std::vector<std::vector<unsigned>>
FindGroups_(const std::vector<unsigned>& feature_list,
            const std::vector<bst_uint>& feature_nnz,
            const ColumnMatrix& colmat,
            unsigned nrow,
            const FastHistParam& param) {
  std::vector<std::vector<unsigned>> groups;
  std::vector<std::vector<bool>> conflict_marks;  // per group: rows already occupied
  std::vector<unsigned> group_nnz;                // per group: total nonzeros absorbed
  std::vector<unsigned> group_conflict_cnt;       // per group: conflicts spent so far
  // Absolute conflict budget derived from the configured rate.
  const unsigned max_conflict_cnt
    = static_cast<unsigned>(param.max_conflict_rate * nrow);

  for (auto fid : feature_list) {
    const Column<T>& column = colmat.GetColumn<T>(fid);
    const size_t cur_fid_nnz = feature_nnz[fid];

    // Candidate groups: those whose nonzero total could still absorb this
    // feature without exceeding nrow plus the conflict budget.
    std::vector<unsigned> search_groups;
    for (size_t gid = 0; gid < groups.size(); ++gid) {
      if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
        search_groups.push_back(gid);
      }
    }
    // Randomize candidate order, then optionally cap how many are inspected.
    std::shuffle(search_groups.begin(), search_groups.end(), common::GlobalRandom());
    if (param.max_search_group > 0 && search_groups.size() > param.max_search_group) {
      search_groups.resize(param.max_search_group);
    }

    // Place fid into the first candidate whose remaining conflict budget
    // tolerates it.
    bool placed = false;
    for (auto gid : search_groups) {
      const unsigned rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
      const unsigned cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
      if (cnt <= rest_max_cnt) {
        groups[gid].push_back(fid);
        group_conflict_cnt[gid] += cnt;
        group_nnz[gid] += cur_fid_nnz - cnt;
        MarkUsed(&conflict_marks[gid], column);
        placed = true;
        break;
      }
    }

    // No suitable group found: open a fresh group containing only fid.
    if (!placed) {
      groups.emplace_back();
      groups.back().push_back(fid);
      group_conflict_cnt.push_back(0);
      conflict_marks.emplace_back(nrow, false);
      MarkUsed(&conflict_marks.back(), column);
      group_nnz.emplace_back(cur_fid_nnz);
    }
  }
  return groups;
}
// Type-dispatching wrapper around FindGroups_.
// NOTE(review): XGBOOST_TYPE_SWITCH is a project macro defined elsewhere;
// presumably it switches on colmat.dtype and binds the matching C++ integer
// type to DType inside the braced body — confirm against its definition.
// The `return` inside the macro body exits this function directly.
inline std::vector<std::vector<unsigned>>
FindGroups(const std::vector<unsigned>& feature_list,
           const std::vector<bst_uint>& feature_nnz,
           const ColumnMatrix& colmat,
           unsigned nrow,
           const FastHistParam& param) {
  XGBOOST_TYPE_SWITCH(colmat.dtype, {
    return FindGroups_<DType>(feature_list, feature_nnz, colmat, nrow, param);
  });
  // Unreachable when the switch covers all dtypes; kept to avoid a
  // missing-return warning.
  return std::vector<std::vector<unsigned>>();  // to avoid warning message
}
inline std::vector<std::vector<unsigned>>
FastFeatureGrouping(const GHistIndexMatrix& gmat,
const ColumnMatrix& colmat,
const FastHistParam& param) {
const size_t nrow = gmat.row_ptr.size() - 1;
const size_t nfeature = gmat.cut->row_ptr.size() - 1;
std::vector<unsigned> feature_list(nfeature);
std::iota(feature_list.begin(), feature_list.end(), 0);
// sort features by nonzero counts, descending order
std::vector<bst_uint> feature_nnz(nfeature);
std::vector<unsigned> features_by_nnz(feature_list);
gmat.GetFeatureCounts(&feature_nnz[0]);
std::sort(features_by_nnz.begin(), features_by_nnz.end(),
[&feature_nnz](int a, int b) {
return feature_nnz[a] > feature_nnz[b];
});
auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;
// take apart small, sparse groups, as it won't help speed
{
std::vector<std::vector<unsigned>> ret;
for (const auto& group : groups) {
if (group.size() <= 1 || group.size() >= 5) {
ret.push_back(group); // keep singleton groups and large (5+) groups
} else {
unsigned nnz = 0;
for (auto fid : group) {
nnz += feature_nnz[fid];
}
double nnz_rate = static_cast<double>(nnz) / nrow;
// take apart small sparse group, due it will not gain on speed
if (nnz_rate <= param.sparse_threshold) {
for (auto fid : group) {
ret.emplace_back();
ret.back().push_back(fid);
}
} else {
ret.push_back(group);
}
}
}
groups = std::move(ret);
}
// shuffle groups
std::shuffle(groups.begin(), groups.end(), common::GlobalRandom());
return groups;
}
// Build one CSR sub-matrix per feature group, concatenate them into the
// shared (index, row_ptr) storage, and record begin/end pointers per block.
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
                                 const ColumnMatrix& colmat,
                                 const FastHistParam& param) {
  cut = gmat.cut;
  const size_t nrow = gmat.row_ptr.size() - 1;
  const size_t nbins = gmat.cut->row_ptr.back();

  /* step 1: form feature groups */
  auto groups = FastFeatureGrouping(gmat, colmat, param);
  const size_t nblock = groups.size();

  /* step 2: build a new CSR matrix for each feature group */
  std::vector<unsigned> bin2block(nbins);  // lookup table [bin id] => [block id]
  for (size_t group_id = 0; group_id < nblock; ++group_id) {
    for (auto& fid : groups[group_id]) {
      // Each feature owns the contiguous bin range [row_ptr[fid], row_ptr[fid+1]).
      const unsigned bin_begin = gmat.cut->row_ptr[fid];
      const unsigned bin_end = gmat.cut->row_ptr[fid + 1];
      for (unsigned bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
        bin2block[bin_id] = group_id;
      }
    }
  }
  std::vector<std::vector<unsigned>> index_temp(nblock);
  std::vector<std::vector<unsigned>> row_ptr_temp(nblock);
  for (size_t block_id = 0; block_id < nblock; ++block_id) {
    row_ptr_temp[block_id].push_back(0);
  }
  // Scatter every row's bin ids to the owning block's CSR under construction.
  for (size_t rid = 0; rid < nrow; ++rid) {
    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
    for (size_t j = ibegin; j < iend; ++j) {
      const size_t bin_id = gmat.index[j];
      const size_t block_id = bin2block[bin_id];
      index_temp[block_id].push_back(bin_id);
    }
    // Close row rid in every block, even blocks that received no entries.
    for (size_t block_id = 0; block_id < nblock; ++block_id) {
      row_ptr_temp[block_id].push_back(index_temp[block_id].size());
    }
  }

  /* step 3: concatenate CSR matrices into one (index, row_ptr) pair */
  std::vector<size_t> index_blk_ptr;
  std::vector<size_t> row_ptr_blk_ptr;
  index_blk_ptr.push_back(0);
  row_ptr_blk_ptr.push_back(0);
  for (size_t block_id = 0; block_id < nblock; ++block_id) {
    index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
    row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
    index_blk_ptr.push_back(index.size());
    row_ptr_blk_ptr.push_back(row_ptr.size());
  }

  // Save shortcut pointers for each block. Use data() arithmetic instead of
  // &vec[i]: the end pointer of the last block equals vec.size(), and
  // vec[vec.size()] via operator[] is undefined behavior.
  for (size_t block_id = 0; block_id < nblock; ++block_id) {
    Block blk;
    blk.index_begin = index.data() + index_blk_ptr[block_id];
    blk.row_ptr_begin = row_ptr.data() + row_ptr_blk_ptr[block_id];
    blk.index_end = index.data() + index_blk_ptr[block_id + 1];
    blk.row_ptr_end = row_ptr.data() + row_ptr_blk_ptr[block_id + 1];
    blocks.push_back(blk);
  }
}
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
@@ -161,33 +403,12 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
GHistRow hist) {
data_.resize(nbins_ * nthread_, GHistEntry());
std::fill(data_.begin(), data_.end(), GHistEntry());
stat_buf_.resize(row_indices.size());
const int K = 8; // loop unrolling factor
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
const bst_omp_uint nrows = row_indices.end - row_indices.begin;
const bst_omp_uint rest = nrows % K;
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
bst_uint rid[K];
bst_gpair stat[K];
for (int k = 0; k < K; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < K; ++k) {
stat_buf_[i + k] = stat[k];
}
}
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
const bst_uint rid = row_indices.begin[i];
const bst_gpair stat = gpair[rid];
stat_buf_[i] = stat;
}
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
const bst_omp_uint tid = omp_get_thread_num();
@@ -204,7 +425,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
}
for (int k = 0; k < K; ++k) {
stat[k] = stat_buf_[i + k];
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < K; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
@@ -217,7 +438,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
const bst_uint rid = row_indices.begin[i];
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
const bst_gpair stat = stat_buf_[i];
const bst_gpair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const size_t bin = gmat.index[j];
data_[bin].Add(stat);
@@ -234,10 +455,60 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
}
}
// Build the gradient histogram by parallelizing over feature-group blocks
// rather than rows. bin2block (see GHistIndexBlockMatrix::Init) assigns each
// bin to exactly one block, so the per-bin writes of different iterations of
// the bid loop touch disjoint histogram entries and need no synchronization.
// NOTE(review): feat_set is not referenced in this body — presumably column
// sampling is applied upstream; confirm before relying on it here.
void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
                                  const RowSetCollection::Elem row_indices,
                                  const GHistIndexBlockMatrix& gmatb,
                                  const std::vector<bst_uint>& feat_set,
                                  GHistRow hist) {
  const int K = 8;  // loop unrolling factor
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
  const bst_omp_uint nblock = gmatb.GetNumBlock();
  const bst_omp_uint nrows = row_indices.end - row_indices.begin;
  const bst_omp_uint rest = nrows % K;  // rows left over after the unrolled loop
  // schedule(guided): block sizes vary, so let OpenMP balance the load.
  #pragma omp parallel for num_threads(nthread) schedule(guided)
  for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
    auto gmat = gmatb[bid];  // CSR view restricted to this block's features
    // Main loop, unrolled by K. Row ids, row extents and gradient stats are
    // gathered in separate passes before the accumulation pass.
    for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
      bst_uint rid[K];
      size_t ibegin[K];
      size_t iend[K];
      bst_gpair stat[K];
      for (int k = 0; k < K; ++k) {
        rid[k] = row_indices.begin[i + k];
      }
      for (int k = 0; k < K; ++k) {
        ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
        iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
      }
      for (int k = 0; k < K; ++k) {
        stat[k] = gpair[rid[k]];
      }
      // Accumulate each row's gradient stats into its bins.
      for (int k = 0; k < K; ++k) {
        for (size_t j = ibegin[k]; j < iend[k]; ++j) {
          const size_t bin = gmat.index[j];
          hist.begin[bin].Add(stat[k]);
        }
      }
    }
    // Remainder loop for the last (nrows % K) rows.
    for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
      const bst_uint rid = row_indices.begin[i];
      const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
      const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
      const bst_gpair stat = gpair[rid];
      for (size_t j = ibegin; j < iend; ++j) {
        const size_t bin = gmat.index[j];
        hist.begin[bin].Add(stat);
      }
    }
  }
}
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
const int K = 8;
const int K = 8; // loop unrolling factor
const bst_omp_uint rest = nbins % K;
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {