De-duplicate GPU parameters. (#4454)

* Only define `gpu_id` and `n_gpus` in `LearnerTrainParam`
* Pass LearnerTrainParam through XGBoost via factory method.
* Disable all GPU usage when GPU-related parameters are not specified (fixes XGBoost choosing GPU over-aggressively).
* Test learner train param io.
* Fix gpu pickling.
This commit is contained in:
Jiaming Yuan
2019-05-29 11:55:57 +08:00
committed by GitHub
parent a3fedbeaa8
commit c589eff941
69 changed files with 927 additions and 562 deletions

View File

@@ -72,10 +72,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
bool refresh_leaf;
// auxiliary data structure
std::vector<int> monotone_constraints;
// gpu to use for single gpu algorithms
int gpu_id;
// number of GPUs to use
int n_gpus;
// the criteria to use for ranking splits
std::string split_evaluator;
@@ -191,14 +187,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
DMLC_DECLARE_FIELD(monotone_constraints)
.set_default(std::vector<int>())
.describe("Constraint of variable monotonicity");
DMLC_DECLARE_FIELD(gpu_id)
.set_lower_bound(0)
.set_default(0)
.describe("gpu to use for single gpu algorithms");
DMLC_DECLARE_FIELD(n_gpus)
.set_lower_bound(-1)
.set_default(1)
.describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs");
DMLC_DECLARE_FIELD(split_evaluator)
.set_default("elastic_net,monotonic,interaction")
.describe("The criteria to use for ranking splits");

View File

@@ -14,12 +14,14 @@ DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);
namespace xgboost {
TreeUpdater* TreeUpdater::Create(const std::string& name) {
TreeUpdater* TreeUpdater::Create(const std::string& name, LearnerTrainParam const* tparam) {
auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown tree updater " << name;
}
return (e->body)();
auto p_updater = (e->body)();
p_updater->tparam_ = tparam;
return p_updater;
}
} // namespace xgboost

View File

@@ -769,7 +769,7 @@ class DistColMaker : public ColMaker {
public:
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
pruner_.reset(TreeUpdater::Create("prune"));
pruner_.reset(TreeUpdater::Create("prune", tparam_));
pruner_->Init(args);
spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
spliteval_->Init(args);

View File

@@ -443,9 +443,10 @@ void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
common::Span<const DeviceNodeStats> nodes,
int nUniqKeys,
NodeIdT nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
ArgMaxByKeyAlgo algo,
GPUSet const& devices) {
dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
param.gpu_id, nodeSplits.data(), nUniqKeys,
*(devices.begin()), nodeSplits.data(), nUniqKeys,
ExactSplitCandidate());
int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
switch (algo) {
@@ -585,7 +586,7 @@ class GPUMaker : public TreeUpdater {
maxNodes_ = (1 << (param_.max_depth + 1)) - 1;
maxLeaves_ = 1 << param_.max_depth;
devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
devices_ = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
@@ -641,7 +642,7 @@ class GPUMaker : public TreeUpdater {
float min_split_loss = param_.min_split_loss;
auto gpu_param = GPUTrainingParam(param_);
dh::LaunchN(param_.gpu_id, nNodes, [=] __device__(int uid) {
dh::LaunchN(*(devices_.begin()), nNodes, [=] __device__(int uid) {
int absNodeId = uid + nodeStart;
ExactSplitCandidate s = d_nodeSplits[uid];
if (s.IsSplittable(min_split_loss)) {
@@ -683,16 +684,18 @@ class GPUMaker : public TreeUpdater {
instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
n_cols_, tmpScanGradBuff_, tmp_scan_key_buff_,
colIds_, nodeStart);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
ArgMaxByKey(nodeSplits_, gradscans_, gradsums_,
vals_.CurrentSpan(), colIds_, nodeAssigns_.CurrentSpan(),
nodes_, nNodes, nodeStart, n_vals_, param_,
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem,
devices);
Split2Node(nNodes, nodeStart);
}
void AllocateAllData(int offsetSize) {
int tmpBuffSize = ScanTempBufferSize(n_vals_);
ba_.Allocate(param_.gpu_id, &vals_, n_vals_,
ba_.Allocate(*(devices_.begin()), &vals_, n_vals_,
&vals_cached_, n_vals_, &instIds_, n_vals_, &inst_ids_cached_, n_vals_,
&colOffsets_, offsetSize, &gradsInst_, n_rows_, &nodeAssigns_, n_vals_,
&nodeLocations_, n_vals_, &nodes_, maxNodes_, &node_assigns_per_inst_,
@@ -783,7 +786,7 @@ class GPUMaker : public TreeUpdater {
auto d_nodes = nodes_;
auto d_sums = gradsums_;
auto gpu_params = GPUTrainingParam(param_);
dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) {
dh::LaunchN(*(devices_.begin()), 1, [=] __device__(int idx) {
d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
});
} else {
@@ -800,7 +803,7 @@ class GPUMaker : public TreeUpdater {
nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
// gather the node assignments across all other columns too
dh::Gather(param_.gpu_id, nodeAssigns_.Current(),
dh::Gather(*(devices_.begin()), nodeAssigns_.Current(),
node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
SortKeys(level);
}
@@ -811,7 +814,7 @@ class GPUMaker : public TreeUpdater {
// but we don't need more than level+1 bits for sorting!
SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_,
colOffsets_, 0, level + 1);
dh::Gather<float, int>(param_.gpu_id, vals_.other(),
dh::Gather<float, int>(*(devices_.begin()), vals_.other(),
vals_.Current(), instIds_.other(), instIds_.Current(),
nodeLocations_.Current(), n_vals_);
vals_.buff.selector ^= 1;

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2018 XGBoost contributors
* Copyright 2017-2019 XGBoost contributors
*/
#include <thrust/copy.h>
#include <thrust/functional.h>
@@ -322,7 +322,7 @@ __global__ void EvaluateSplitKernel(
node_histogram, // histogram for gradients
common::Span<const int> feature_set, // Selected features
DeviceNodeStats node,
ELLPackMatrix matrix,
ELLPackMatrix matrix,
GPUTrainingParam gpu_param,
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
ValueConstraint value_constraint,
@@ -1377,13 +1377,16 @@ template <typename GradientSumT>
class GPUHistMakerSpecialised{
public:
GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {}
void Init(
const std::vector<std::pair<std::string, std::string>>& args) {
void Init(const std::vector<std::pair<std::string, std::string>>& args,
LearnerTrainParam const* lparam) {
param_.InitAllowUnknown(args);
learner_param_ = lparam;
hist_maker_param_.InitAllowUnknown(args);
CHECK(param_.n_gpus != 0) << "Must have at least one device";
n_devices_ = param_.n_gpus;
dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus));
auto devices = GPUSet::All(learner_param_->gpu_id,
learner_param_->n_gpus);
n_devices_ = devices.Size();
CHECK(n_devices_ != 0) << "Must have at least one device";
dist_ = GPUDistribution::Block(devices);
dh::CheckComputeCapability();
@@ -1446,7 +1449,8 @@ class GPUHistMakerSpecialised{
// Find the cuts.
monitor_.StartCuda("Quantiles");
common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows,
GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus));
n_bins_ = hmat_.row_ptr.back();
monitor_.StopCuda("Quantiles");
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
@@ -1552,6 +1556,7 @@ class GPUHistMakerSpecialised{
int n_bins_;
GPUHistMakerTrainParam hist_maker_param_;
LearnerTrainParam const* learner_param_;
common::GHistIndexMatrix gmat_;
dh::AllReducer reducer_;
@@ -1573,10 +1578,10 @@ class GPUHistMaker : public TreeUpdater {
double_maker_.reset();
if (hist_maker_param_.single_precision_histogram) {
float_maker_.reset(new GPUHistMakerSpecialised<GradientPair>());
float_maker_->Init(args);
float_maker_->Init(args, tparam_);
} else {
double_maker_.reset(new GPUHistMakerSpecialised<GradientPairPrecise>());
double_maker_->Init(args);
double_maker_->Init(args, tparam_);
}
}

View File

@@ -22,7 +22,7 @@ DMLC_REGISTRY_FILE_TAG(updater_prune);
class TreePruner: public TreeUpdater {
public:
TreePruner() {
syncher_.reset(TreeUpdater::Create("sync"));
syncher_.reset(TreeUpdater::Create("sync", tparam_));
}
// set training parameter
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {

View File

@@ -35,7 +35,7 @@ DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
void QuantileHistMaker::Init(const std::vector<std::pair<std::string, std::string> >& args) {
// initialize pruner
if (!pruner_) {
pruner_.reset(TreeUpdater::Create("prune"));
pruner_.reset(TreeUpdater::Create("prune", tparam_));
}
pruner_->Init(args);
param_.InitAllowUnknown(args);