Optimizations for RNG in InitData kernel (#5522)

* optimizations for subsampling in InitData

* optimizations for subsampling in InitData

Co-authored-by: SHVETS, KIRILL <kirill.shvets@intel.com>
This commit is contained in:
ShvetsKS
2020-04-16 18:24:32 +03:00
committed by GitHub
parent e268fb0093
commit a2d86b8e4b
3 changed files with 110 additions and 10 deletions

View File

@@ -536,6 +536,63 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache(
return true;
}
/*!
 * \brief Uniformly subsample training rows for one boosting iteration.
 *
 * Fills *row_indices with the indices of the rows kept by a Bernoulli
 * trial with probability param_.subsample, skipping rows whose hessian
 * is negative, then shrinks the vector to the number of kept rows.
 *
 * \param gpair        per-row gradient pairs; only GetHess() is read here
 * \param fmat         data matrix; only Info().num_row_ is read here
 * \param row_indices  in: pre-sized to num_row_ (done by the caller);
 *                     out: resized to the number of sampled rows
 */
void QuantileHistMaker::Builder::InitSampling(const std::vector<GradientPair>& gpair,
const DMatrix& fmat,
std::vector<size_t>* row_indices) {
const auto& info = fmat.Info();
// Global engine: sampled rows must be reproducible across runs, so the
// per-thread engines below are seeded from (copies of) this one.
auto& rnd = common::GlobalRandom();
std::vector<size_t>& row_indices_local = *row_indices;
size_t* p_row_indices = row_indices_local.data();
#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
// Custom PRNG build: single-threaded path, one coin flip per row drawn
// directly from the (customized) global engine.
std::bernoulli_distribution coin_flip(param_.subsample);
size_t j = 0;
for (size_t i = 0; i < info.num_row_; ++i) {
// Rows with negative hessian are always excluded from sampling.
if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
p_row_indices[j++] = i;
}
}
/* shrink row_indices to the number of sampled rows to reduce memory */
row_indices_local.resize(j);
#else
// Parallel path: each thread gets its own copy of the global engine and
// fast-forwards it past the values the earlier threads will consume, so
// the combined result matches a single sequential pass over the stream.
const size_t nthread = this->nthread_;
// row_offsets[tid] = number of rows thread tid kept in its chunk.
std::vector<size_t> row_offsets(nthread, 0);
/* NOTE(review): comment in the original said mt19937_64 gives a 2x
 * speed-up, but the engines below are 32-bit std::mt19937 — presumably
 * kept to stay assignment-compatible with common::GlobalRandom(). */
std::vector<std::mt19937> rnds(nthread);
/* seed each per-thread engine with a copy of the global engine state */
for (std::mt19937& r : rnds) {
r = rnd;
}
// Rows are split into nthread contiguous chunks; the last thread also
// takes the remainder (see iend below).
const size_t discard_size = info.num_row_ / nthread;
#pragma omp parallel num_threads(nthread)
{
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * discard_size;
const size_t iend = (tid == (nthread - 1)) ?
info.num_row_ : ibegin + discard_size;
std::bernoulli_distribution coin_flip(param_.subsample);
// Skip the engine values consumed by the preceding chunks. The factor
// of 2 assumes each bernoulli draw consumes two 32-bit engine outputs
// (typical for a double-based distribution over mt19937) — this is
// implementation-dependent; TODO(review) confirm on all toolchains.
rnds[tid].discard(2*discard_size * tid);
for (size_t i = ibegin; i < iend; ++i) {
// Same filter as the sequential path: drop negative-hessian rows.
if (gpair[i].GetHess() >= 0.0f && coin_flip(rnds[tid])) {
// Kept rows are written compactly at the front of this thread's
// chunk; chunks are stitched together after the parallel region.
p_row_indices[ibegin + row_offsets[tid]++] = i;
}
}
}
/* advance the global engine by adopting the last thread's final state,
 * so subsequent consumers of GlobalRandom() do not replay these draws */
rnd = rnds[nthread - 1];
// Compact: move each thread's kept indices next to the previous chunk's
// (prefix_sum tracks the total number of rows kept so far).
size_t prefix_sum = row_offsets[0];
for (size_t i = 1; i < nthread; ++i) {
const size_t ibegin = i * discard_size;
for (size_t k = 0; k < row_offsets[i]; ++k) {
row_indices_local[prefix_sum + k] = row_indices_local[ibegin + k];
}
prefix_sum += row_offsets[i];
}
/* shrink row_indices to the number of sampled rows to reduce memory */
row_indices_local.resize(prefix_sum);
#endif  // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
}
void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
const std::vector<GradientPair>& gpair,
const DMatrix& fmat,
@@ -569,22 +626,14 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
std::vector<size_t>& row_indices = *row_set_collection_.Data();
row_indices.resize(info.num_row_);
auto* p_row_indices = row_indices.data();
size_t* p_row_indices = row_indices.data();
// mark subsample and build list of member rows
if (param_.subsample < 1.0f) {
CHECK_EQ(param_.sampling_method, TrainParam::kUniform)
<< "Only uniform sampling is supported, "
<< "gradient-based sampling is only support by GPU Hist.";
std::bernoulli_distribution coin_flip(param_.subsample);
auto& rnd = common::GlobalRandom();
size_t j = 0;
for (size_t i = 0; i < info.num_row_; ++i) {
if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
p_row_indices[j++] = i;
}
}
row_indices.resize(j);
InitSampling(gpair, fmat, &row_indices);
} else {
MemStackAllocator<bool, 128> buff(this->nthread_);
bool* p_buff = buff.Get();

View File

@@ -202,6 +202,9 @@ class QuantileHistMaker: public TreeUpdater {
const DMatrix& fmat,
const RegTree& tree);
void InitSampling(const std::vector<GradientPair>& gpair,
const DMatrix& fmat, std::vector<size_t>* row_indices);
void EvaluateSplits(const std::vector<ExpandEntry>& nodes_set,
const GHistIndexMatrix& gmat,
const HistCollection& hist,