Optimizations for RNG in InitData kernel (#5522)

* optimizations for subsampling in InitData * optimizations for subsampling in InitData Co-authored-by: SHVETS, KIRILL <kirill.shvets@intel.com>
2020-04-16 18:24:32 +03:00 · 2020-04-16 18:24:32 +03:00 · a2d86b8e4b
commit a2d86b8e4b
parent e268fb0093
3 changed files with 110 additions and 10 deletions
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@ -536,6 +536,63 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache(
  return true;
 }

+void QuantileHistMaker::Builder::InitSampling(const std::vector<GradientPair>& gpair,
+                                                const DMatrix& fmat,
+                                                std::vector<size_t>* row_indices) {
+  const auto& info = fmat.Info();
+  auto& rnd = common::GlobalRandom();
+  std::vector<size_t>& row_indices_local = *row_indices;
+  size_t* p_row_indices = row_indices_local.data();
+#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
+  std::bernoulli_distribution coin_flip(param_.subsample);
+  size_t j = 0;
+  for (size_t i = 0; i < info.num_row_; ++i) {
+    if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
+      p_row_indices[j++] = i;
+    }
+  }
+  /* resize row_indices to reduce memory */
+  row_indices_local.resize(j);
+#else
+  const size_t nthread = this->nthread_;
+  std::vector<size_t> row_offsets(nthread, 0);
+  /* usage of mt19937_64 give 2x speed up for subsampling */
+  std::vector<std::mt19937> rnds(nthread);
+  /* create engine for each thread */
+  for (std::mt19937& r : rnds) {
+    r = rnd;
+  }
+  const size_t discard_size = info.num_row_ / nthread;
+  #pragma omp parallel num_threads(nthread)
+  {
+    const size_t tid = omp_get_thread_num();
+    const size_t ibegin = tid * discard_size;
+    const size_t iend = (tid == (nthread - 1)) ?
+                        info.num_row_ : ibegin + discard_size;
+    std::bernoulli_distribution coin_flip(param_.subsample);
+
+    rnds[tid].discard(2*discard_size * tid);
+    for (size_t i = ibegin; i < iend; ++i) {
+      if (gpair[i].GetHess() >= 0.0f && coin_flip(rnds[tid])) {
+        p_row_indices[ibegin + row_offsets[tid]++] = i;
+      }
+    }
+  }
+  /* discard global engine */
+  rnd = rnds[nthread - 1];
+  size_t prefix_sum = row_offsets[0];
+  for (size_t i = 1; i < nthread; ++i) {
+    const size_t ibegin = i * discard_size;
+
+    for (size_t k = 0; k < row_offsets[i]; ++k) {
+      row_indices_local[prefix_sum + k] = row_indices_local[ibegin + k];
+    }
+    prefix_sum += row_offsets[i];
+  }
+  /* resize row_indices to reduce memory */
+  row_indices_local.resize(prefix_sum);
+#endif  // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
+}
 void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
                                          const std::vector<GradientPair>& gpair,
                                          const DMatrix& fmat,
@ -569,22 +626,14 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,

    std::vector<size_t>& row_indices = *row_set_collection_.Data();
    row_indices.resize(info.num_row_);
-    auto* p_row_indices = row_indices.data();
+    size_t* p_row_indices = row_indices.data();
    // mark subsample and build list of member rows

    if (param_.subsample < 1.0f) {
      CHECK_EQ(param_.sampling_method, TrainParam::kUniform)
        << "Only uniform sampling is supported, "
        << "gradient-based sampling is only support by GPU Hist.";
-      std::bernoulli_distribution coin_flip(param_.subsample);
-      auto& rnd = common::GlobalRandom();
-      size_t j = 0;
-      for (size_t i = 0; i < info.num_row_; ++i) {
-        if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
-          p_row_indices[j++] = i;
-        }
-      }
-      row_indices.resize(j);
+      InitSampling(gpair, fmat, &row_indices);
    } else {
      MemStackAllocator<bool, 128> buff(this->nthread_);
      bool* p_buff = buff.Get();
--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@ -202,6 +202,9 @@ class QuantileHistMaker: public TreeUpdater {
                  const DMatrix& fmat,
                  const RegTree& tree);

+    void InitSampling(const std::vector<GradientPair>& gpair,
+                      const DMatrix& fmat, std::vector<size_t>* row_indices);
+
    void EvaluateSplits(const std::vector<ExpandEntry>& nodes_set,
                        const GHistIndexMatrix& gmat,
                        const HistCollection& hist,
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@ -96,6 +96,31 @@ class QuantileHistMock : public QuantileHistMaker {
      }
    }

+    void TestInitDataSampling(const GHistIndexMatrix& gmat,
+                      const std::vector<GradientPair>& gpair,
+                      DMatrix* p_fmat,
+                      const RegTree& tree) {
+      const size_t nthreads = omp_get_num_threads();
+      // save state of global rng engine
+      auto initial_rnd = common::GlobalRandom();
+      RealImpl::InitData(gmat, gpair, *p_fmat, tree);
+      std::vector<size_t> row_indices_initial = *row_set_collection_.Data();
+
+      for (size_t i_nthreads = 1; i_nthreads < 4; ++i_nthreads) {
+        omp_set_num_threads(i_nthreads);
+        // return initial state of global rng engine
+        common::GlobalRandom() = initial_rnd;
+        RealImpl::InitData(gmat, gpair, *p_fmat, tree);
+        std::vector<size_t>& row_indices = *row_set_collection_.Data();
+        ASSERT_EQ(row_indices_initial.size(), row_indices.size());
+        for (size_t i = 0; i < row_indices_initial.size(); ++i) {
+          ASSERT_EQ(row_indices_initial[i], row_indices[i]);
+        }
+      }
+      omp_set_num_threads(nthreads);
+    }
+
+
    void TestBuildHist(int nid,
                       const GHistIndexMatrix& gmat,
                       const DMatrix& fmat,
@ -266,6 +291,20 @@ class QuantileHistMock : public QuantileHistMaker {
    builder_->TestInitData(gmat, gpair, dmat_.get(), tree);
  }

+  void TestInitDataSampling() {
+    size_t constexpr kMaxBins = 4;
+    common::GHistIndexMatrix gmat;
+    gmat.Init(dmat_.get(), kMaxBins);
+
+    RegTree tree = RegTree();
+    tree.param.UpdateAllowUnknown(cfg_);
+
+    std::vector<GradientPair> gpair =
+        { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
+          {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
+
+    builder_->TestInitDataSampling(gmat, gpair, dmat_.get(), tree);
+  }
  void TestBuildHist() {
    RegTree tree = RegTree();
    tree.param.UpdateAllowUnknown(cfg_);
@ -292,6 +331,15 @@ TEST(QuantileHist, InitData) {
  maker.TestInitData();
 }

+TEST(QuantileHist, InitDataSampling) {
+  const float subsample = 0.5;
+  std::vector<std::pair<std::string, std::string>> cfg
+      {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
+       {"subsample", std::to_string(subsample)}};
+  QuantileHistMock maker(cfg);
+  maker.TestInitDataSampling();
+}
+
 TEST(QuantileHist, BuildHist) {
  // Don't enable feature grouping
  std::vector<std::pair<std::string, std::string>> cfg