Specify the number of threads for parallel sort. (#8735)

* Specify the number of threads for parallel sort. - Pass context object into argsort. - Replace macros with inline functions.
2023-02-16 00:20:19 +08:00
parent c7c485d052
commit 282b1729da
24 changed files with 254 additions and 143 deletions
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -34,10 +34,10 @@ class HistEvaluator {
  };

 private:
+  Context const* ctx_;
  TrainParam param_;
  std::shared_ptr<common::ColumnSampler> column_sampler_;
  TreeEvaluator tree_evaluator_;
-  int32_t n_threads_ {0};
  FeatureInteractionConstraintHost interaction_constraints_;
  std::vector<NodeEntry> snode_;

@@ -283,6 +283,7 @@ class HistEvaluator {
  void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
                      common::Span<FeatureType const> feature_types, const RegTree &tree,
                      std::vector<ExpandEntry> *p_entries) {
+    auto n_threads = ctx_->Threads();
    auto& entries = *p_entries;
    // All nodes are on the same level, so we can store the shared ptr.
    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
@@ -294,23 +295,23 @@ class HistEvaluator {
    }
    CHECK(!features.empty());
    const size_t grain_size =
-        std::max<size_t>(1, features.front()->Size() / n_threads_);
+        std::max<size_t>(1, features.front()->Size() / n_threads);
    common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
      return features[nidx_in_set]->Size();
    }, grain_size);

-    std::vector<ExpandEntry> tloc_candidates(n_threads_ * entries.size());
+    std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
    for (size_t i = 0; i < entries.size(); ++i) {
-      for (decltype(n_threads_) j = 0; j < n_threads_; ++j) {
-        tloc_candidates[i * n_threads_ + j] = entries[i];
+      for (decltype(n_threads) j = 0; j < n_threads; ++j) {
+        tloc_candidates[i * n_threads + j] = entries[i];
      }
    }
    auto evaluator = tree_evaluator_.GetEvaluator();
    auto const& cut_ptrs = cut.Ptrs();

-    common::ParallelFor2d(space, n_threads_, [&](size_t nidx_in_set, common::Range1d r) {
+    common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
      auto tidx = omp_get_thread_num();
-      auto entry = &tloc_candidates[n_threads_ * nidx_in_set + tidx];
+      auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
      auto best = &entry->split;
      auto nidx = entry->nid;
      auto histogram = hist[nidx];
@@ -349,9 +350,9 @@ class HistEvaluator {

    for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
         ++nidx_in_set) {
-      for (auto tidx = 0; tidx < n_threads_; ++tidx) {
+      for (auto tidx = 0; tidx < n_threads; ++tidx) {
        entries[nidx_in_set].split.Update(
-            tloc_candidates[n_threads_ * nidx_in_set + tidx].split);
+            tloc_candidates[n_threads * nidx_in_set + tidx].split);
      }
    }
  }
@@ -424,15 +425,15 @@ class HistEvaluator {
 public:
  // The column sampler must be constructed by caller since we need to preserve the rng
  // for the entire training session.
-  explicit HistEvaluator(TrainParam const &param, MetaInfo const &info, int32_t n_threads,
+  explicit HistEvaluator(Context const* ctx, TrainParam const &param, MetaInfo const &info,
                         std::shared_ptr<common::ColumnSampler> sampler)
-      : param_{param},
+      : ctx_{ctx}, param_{param},
        column_sampler_{std::move(sampler)},
-        tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
-        n_threads_{n_threads} {
+        tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId} {
    interaction_constraints_.Configure(param, info.num_col_);
-    column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode,
-                          param_.colsample_bylevel, param_.colsample_bytree);
+    column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
+                          param_.colsample_bynode, param_.colsample_bylevel,
+                          param_.colsample_bytree);
  }
 };

--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -171,7 +171,7 @@ class GloablApproxBuilder {
                               common::Monitor *monitor)
      : param_{std::move(param)},
        col_sampler_{std::move(column_sampler)},
-        evaluator_{param_, info, ctx->Threads(), col_sampler_},
+        evaluator_{ctx, param_, info, col_sampler_},
        ctx_{ctx},
        task_{task},
        monitor_{monitor} {}
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -234,9 +234,9 @@ class ColMaker: public TreeUpdater {
        }
      }
      {
-        column_sampler_.Init(fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
-                             param_.colsample_bynode, param_.colsample_bylevel,
-                             param_.colsample_bytree);
+        column_sampler_.Init(ctx_, fmat.Info().num_col_,
+                             fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
+                             param_.colsample_bylevel, param_.colsample_bytree);
      }
      {
        // setup temp space for each thread
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -243,7 +243,7 @@ struct GPUHistMakerDevice {
  // thread safe
  void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
    auto const& info = dmat->Info();
-    this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
+    this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
                              param.colsample_bynode, param.colsample_bylevel,
                              param.colsample_bytree);
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -290,8 +290,7 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,

  // store a pointer to the tree
  p_last_tree_ = &tree;
-  evaluator_.reset(
-      new HistEvaluator<CPUExpandEntry>{param_, info, this->ctx_->Threads(), column_sampler_});
+  evaluator_.reset(new HistEvaluator<CPUExpandEntry>{ctx_, param_, info, column_sampler_});

  monitor_->Stop(__func__);
 }