De-duplicate GPU parameters. (#4454)

* Only define `gpu_id` and `n_gpus` in `LearnerTrainParam`. * Pass LearnerTrainParam through XGBoost via factory method. * Disable all GPU usage when GPU-related parameters are not specified (fixes XGBoost choosing GPU too aggressively). * Test learner train param IO. * Fix GPU pickling.
2019-05-29 11:55:57 +08:00
parent a3fedbeaa8
commit c589eff941
69 changed files with 927 additions and 562 deletions
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -1,9 +1,10 @@
 /*!
- * Copyright 2015-2018 by Contributors
+ * Copyright 2015-2019 by Contributors
 * \file common.cc
 * \brief Enable all kinds of global variables in common.
 */
 #include <dmlc/thread_local.h>
+#include <xgboost/logging.h>

 #include "common.h"
 #include "./random.h"
@@ -29,4 +30,46 @@ int AllVisibleImpl::AllVisible() {
 }
 #endif  // !defined(XGBOOST_USE_CUDA)

+constexpr GPUSet::GpuIdType GPUSet::kAll;
+
+/*!
+ * \brief Select the devices to use, starting from `gpu_id`.
+ *
+ * Returns an empty set when no device is visible or when `n_gpus` or `n_rows`
+ * is 0.  Otherwise `n_gpus` (every remaining visible device when it equals
+ * kAll) and `n_rows` both cap the number of devices selected.
+ */
+GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, GpuIdType n_rows) {
+  CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
+  CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
+
+  GpuIdType const n_devices_visible = AllVisible().Size();
+  if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
+    LOG(DEBUG) << "Running on CPU.";
+    return Empty();
+  }
+
+  GpuIdType const n_available_devices = n_devices_visible - gpu_id;
+
+  if (n_gpus == kAll) {  // Use all devices starting from `gpu_id'.
+    CHECK(gpu_id < n_devices_visible)
+        << "\ngpu_id should be less than number of visible devices.\ngpu_id: "
+        << gpu_id
+        << ", number of visible devices: "
+        << n_devices_visible;
+    GpuIdType n_devices =
+        n_available_devices < n_rows ? n_available_devices : n_rows;
+    LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
+    return Range(gpu_id, n_devices);
+  } else {  // Use devices in [gpu_id, gpu_id + n_gpus).
+    CHECK_LE(n_gpus, n_available_devices)
+        << "Starting from gpu id: " << gpu_id << ", there are only "
+        << n_available_devices << " available devices, while n_gpus is set to: "
+        << n_gpus;
+    GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
+    LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
+    return Range(gpu_id, n_devices);
+  }
+}
+
 }  // namespace xgboost
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -135,8 +135,8 @@ class Range {
  Iterator begin_;
  Iterator end_;
 };
-
 }  // namespace common
+
 struct AllVisibleImpl {
  static int AllVisible();
 };
@@ -160,33 +160,7 @@ class GPUSet {
  }
  /*! \brief n_gpus and num_rows both are upper bounds. */
  static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
-                    GpuIdType num_rows = std::numeric_limits<GpuIdType>::max()) {
-    CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
-    CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
-
-    GpuIdType const n_devices_visible = AllVisible().Size();
-    if (n_devices_visible == 0 || n_gpus == 0) { return Empty(); }
-
-    GpuIdType const n_available_devices = n_devices_visible - gpu_id;
-
-    if (n_gpus == kAll) {  // Use all devices starting from `gpu_id'.
-      CHECK(gpu_id < n_devices_visible)
-          << "\ngpu_id should be less than number of visible devices.\ngpu_id: "
-          << gpu_id
-          << ", number of visible devices: "
-          << n_devices_visible;
-      GpuIdType n_devices =
-          n_available_devices < num_rows ? n_available_devices : num_rows;
-      return Range(gpu_id, n_devices);
-    } else {  // Use devices in ( gpu_id, gpu_id + n_gpus ).
-      CHECK_LE(n_gpus, n_available_devices)
-          << "Starting from gpu id: " << gpu_id << ", there are only "
-          << n_available_devices << " available devices, while n_gpus is set to: "
-          << n_gpus;
-      GpuIdType n_devices = n_gpus < num_rows ? n_gpus : num_rows;
-      return Range(gpu_id, n_devices);
-    }
-  }
+                    GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());

  static GPUSet AllVisible() {
    GpuIdType n =  AllVisibleImpl::AllVisible();
--- a/src/common/enum_class_param.h
+++ b/src/common/enum_class_param.h
@@ -1,81 +0,0 @@
-/*!
- * Copyright 2018 by Contributors
- * \file enum_class_param.h
- * \brief macro for using C++11 enum class as DMLC parameter
- * \author Hyunsu Philip Cho
- */
-
-#ifndef XGBOOST_COMMON_ENUM_CLASS_PARAM_H_
-#define XGBOOST_COMMON_ENUM_CLASS_PARAM_H_
-
-#include <dmlc/parameter.h>
-#include <string>
-#include <type_traits>
-
-/*!
- * \brief Specialization of FieldEntry for enum class (backed by int)
- *
- * Use this macro to use C++11 enum class as DMLC parameters
- *
- * Usage:
- *
- * \code{.cpp}
- *
- *   // enum class must inherit from int type
- *   enum class Foo : int {
- *     kBar = 0, kFrog = 1, kCat = 2, kDog = 3
- *   };
- *
- *   // This line is needed to prevent compilation error
- *   DECLARE_FIELD_ENUM_CLASS(Foo);
- *
- *   // Now define DMLC parameter as usual;
- *   //   enum classes can now be members.
- *   struct MyParam : dmlc::Parameter<MyParam> {
- *     Foo foo;
- *     DMLC_DECLARE_PARAMETER(MyParam) {
- *       DMLC_DECLARE_FIELD(foo)
- *         .set_default(Foo::kBar)
- *         .add_enum("bar", Foo::kBar)
- *         .add_enum("frog", Foo::kFrog)
- *         .add_enum("cat", Foo::kCat)
- *         .add_enum("dog", Foo::kDog);
- *     }
- *   };
- *
- *   DMLC_REGISTER_PARAMETER(MyParam);
- * \endcode
- */
-#define DECLARE_FIELD_ENUM_CLASS(EnumClass) \
-namespace dmlc {  \
-namespace parameter {  \
-template <>  \
-class FieldEntry<EnumClass> : public FieldEntry<int> {  \
- public:  \
-  FieldEntry<EnumClass>() {  \
-    static_assert(  \
-      std::is_same<int, typename std::underlying_type<EnumClass>::type>::value,  \
-      "enum class must be backed by int");  \
-    is_enum_ = true;  \
-  }  \
-  using Super = FieldEntry<int>;  \
-  void Set(void *head, const std::string &value) const override {  \
-    Super::Set(head, value);  \
-  }  \
-  inline FieldEntry<EnumClass>& add_enum(const std::string &key, EnumClass value) {  \
-    Super::add_enum(key, static_cast<int>(value));  \
-    return *this;  \
-  }  \
-  inline FieldEntry<EnumClass>& set_default(const EnumClass& default_value) {  \
-    default_value_ = static_cast<int>(default_value);  \
-    has_default_ = true;  \
-    return *this;  \
-  }  \
-  inline void Init(const std::string &key, void *head, EnumClass& ref) {  /* NOLINT */  \
-    Super::Init(key, head, *reinterpret_cast<int*>(&ref));  \
-  }  \
-};  \
-}  /* namespace parameter */  \
-}  /* namespace dmlc */
-
-#endif  // XGBOOST_COMMON_ENUM_CLASS_PARAM_H_
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -383,8 +383,8 @@ struct GPUSketcher {
    hmat->Init(&sketches, param_.max_bin);
  }

-  GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
-    dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus, n_rows));
+  GPUSketcher(tree::TrainParam param, GPUSet const& devices) : param_(std::move(param)) {
+    dist_ = GPUDistribution::Block(devices);
  }

 private:
@@ -395,8 +395,9 @@ struct GPUSketcher {

 void DeviceSketch
  (const SparsePage& batch, const MetaInfo& info,
-   const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows) {
-  GPUSketcher sketcher(param, info.num_row_);
+   const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows,
+   GPUSet const& devices) {
+  GPUSketcher sketcher(param, devices);
  sketcher.Sketch(batch, info, hmat, gpu_batch_nrows);
 }

--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -118,7 +118,8 @@ struct HistCutMatrix {
 /*! \brief Builds the cut matrix on the GPU */
 void DeviceSketch
  (const SparsePage& batch, const MetaInfo& info,
-   const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows);
+   const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows,
+   GPUSet const& devices);

 /*!
 * \brief A single row in global histogram index.
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2017 XGBoost contributors
+ * Copyright 2017-2019 XGBoost contributors
 */

 /**
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -62,7 +62,7 @@ class GBLinear : public GradientBooster {
      model_.param.InitAllowUnknown(cfg);
    }
    param_.InitAllowUnknown(cfg);
-    updater_.reset(LinearUpdater::Create(param_.updater));
+    updater_.reset(LinearUpdater::Create(param_.updater, learner_param_));
    updater_->Init(cfg);
    monitor_.Init("GBLinear");
  }
--- a/src/gbm/gblinear_model.h
+++ b/src/gbm/gblinear_model.h
@@ -4,6 +4,7 @@
 #pragma once
 #include <dmlc/io.h>
 #include <dmlc/parameter.h>
+#include <xgboost/base.h>
 #include <xgboost/feature_map.h>
 #include <vector>
 #include <string>
--- a/src/gbm/gbm.cc
+++ b/src/gbm/gbm.cc
@@ -13,13 +13,16 @@ DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg);
 namespace xgboost {
 GradientBooster* GradientBooster::Create(
    const std::string& name,
+    LearnerTrainParam const* learner_param,
    const std::vector<std::shared_ptr<DMatrix> >& cache_mats,
    bst_float base_margin) {
  auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown gbm type " << name;
  }
-  return (e->body)(cache_mats, base_margin);
+  auto p_bst =  (e->body)(cache_mats, base_margin);
+  p_bst->learner_param_ = learner_param;
+  return p_bst;
 }

 }  // namespace xgboost
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -147,7 +147,7 @@ class GBTree : public GradientBooster {
    }

    // configure predictor
-    predictor_ = std::unique_ptr<Predictor>(Predictor::Create(tparam_.predictor));
+    predictor_ = std::unique_ptr<Predictor>(Predictor::Create(tparam_.predictor, learner_param_));
    predictor_->Init(cfg, cache_);
    monitor_.Init("GBTree");
  }
@@ -252,7 +252,7 @@ class GBTree : public GradientBooster {
    std::string tval = tparam_.updater_seq;
    std::vector<std::string> ups = common::Split(tval, ',');
    for (const std::string& pstr : ups) {
-      std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str()));
+      std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str(), learner_param_));
      up->Init(this->cfg_);
      updaters_.push_back(std::move(up));
    }
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2014 by Contributors
+ * Copyright 2014-2019 by Contributors
 * \file learner.cc
 * \brief Implementation of learning algorithm.
 * \author Tianqi Chen
@@ -8,6 +8,7 @@
 #include <dmlc/timer.h>
 #include <xgboost/learner.h>
 #include <xgboost/logging.h>
+#include <xgboost/generic_parameters.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -20,22 +21,12 @@
 #include "./common/host_device_vector.h"
 #include "./common/io.h"
 #include "./common/random.h"
-#include "./common/enum_class_param.h"
 #include "./common/timer.h"

 namespace {

 const char* kMaxDeltaStepDefaultValue = "0.7";

-enum class TreeMethod : int {
-  kAuto = 0, kApprox = 1, kExact = 2, kHist = 3,
-  kGPUExact = 4, kGPUHist = 5
-};
-
-enum class DataSplitMode : int {
-  kAuto = 0, kCol = 1, kRow = 2
-};
-
 inline bool IsFloat(const std::string& str) {
  std::stringstream ss(str);
  float f;
@@ -58,9 +49,6 @@ inline std::string RenderParamVal(const std::string& str) {

 }  // anonymous namespace

-DECLARE_FIELD_ENUM_CLASS(TreeMethod);
-DECLARE_FIELD_ENUM_CLASS(DataSplitMode);
-
 namespace xgboost {
 // implementation of base learner.
 bool Learner::AllowLazyCheckPoint() const {
@@ -108,56 +96,6 @@ struct LearnerModelParam : public dmlc::Parameter<LearnerModelParam> {
  }
 };

-struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
-  // stored random seed
-  int seed;
-  // whether seed the PRNG each iteration
-  bool seed_per_iteration;
-  // data split mode, can be row, col, or none.
-  DataSplitMode dsplit;
-  // tree construction method
-  TreeMethod tree_method;
-  // internal test flag
-  std::string test_flag;
-  // number of threads to use if OpenMP is enabled
-  // if equals 0, use system default
-  int nthread;
-  // flag to disable default metric
-  int disable_default_eval_metric;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
-    DMLC_DECLARE_FIELD(seed).set_default(0).describe(
-        "Random number seed during training.");
-    DMLC_DECLARE_FIELD(seed_per_iteration)
-        .set_default(false)
-        .describe(
-            "Seed PRNG determnisticly via iterator number, "
-            "this option will be switched on automatically on distributed "
-            "mode.");
-    DMLC_DECLARE_FIELD(dsplit)
-        .set_default(DataSplitMode::kAuto)
-        .add_enum("auto", DataSplitMode::kAuto)
-        .add_enum("col", DataSplitMode::kCol)
-        .add_enum("row", DataSplitMode::kRow)
-        .describe("Data split mode for distributed training.");
-    DMLC_DECLARE_FIELD(tree_method)
-        .set_default(TreeMethod::kAuto)
-        .add_enum("auto", TreeMethod::kAuto)
-        .add_enum("approx", TreeMethod::kApprox)
-        .add_enum("exact", TreeMethod::kExact)
-        .add_enum("hist", TreeMethod::kHist)
-        .add_enum("gpu_exact", TreeMethod::kGPUExact)
-        .add_enum("gpu_hist", TreeMethod::kGPUHist)
-        .describe("Choice of tree construction method.");
-    DMLC_DECLARE_FIELD(test_flag).set_default("").describe(
-        "Internal test flag");
-    DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
-        "Number of threads to use.");
-    DMLC_DECLARE_FIELD(disable_default_eval_metric)
-        .set_default(0)
-        .describe("flag to disable default metric. Set to >0 to disable");
-  }
-};

 DMLC_REGISTER_PARAMETER(LearnerModelParam);
 DMLC_REGISTER_PARAMETER(LearnerTrainParam);
@@ -237,6 +175,29 @@ class LearnerImpl : public Learner {
    }
  }

+  void ConfigureObjective() {
+    if (cfg_.count("num_class") != 0) {
+      cfg_["num_output_group"] = cfg_["num_class"];
+      if (atoi(cfg_["num_class"].c_str()) > 1 && cfg_.count("objective") == 0) {
+        cfg_["objective"] = "multi:softmax";
+      }
+    }
+
+    if (cfg_.find("max_delta_step") == cfg_.cend() &&
+        cfg_.find("objective") != cfg_.cend() &&
+        cfg_["objective"] == "count:poisson") {
+      cfg_["max_delta_step"] = kMaxDeltaStepDefaultValue;
+    }
+
+    if (cfg_.count("objective") == 0) {
+      cfg_["objective"] = "reg:squarederror";
+    }
+    if (cfg_.count("booster") == 0) {
+      cfg_["booster"] = "gbtree";
+    }
+  }
+
+  // Configuration before data is known.
  void Configure(
      const std::vector<std::pair<std::string, std::string> >& args) override {
    // add to configurations
@@ -252,7 +213,7 @@ class LearnerImpl : public Learner {
          return m->Name() != kv.second;
        };
        if (std::all_of(metrics_.begin(), metrics_.end(), dup_check)) {
-          metrics_.emplace_back(Metric::Create(kv.second));
+          metrics_.emplace_back(Metric::Create(kv.second, &tparam_));
          mparam_.contain_eval_metrics = 1;
        }
      } else {
@@ -268,27 +229,11 @@ class LearnerImpl : public Learner {
    if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
      tparam_.dsplit = DataSplitMode::kRow;
    }
-    if (cfg_.count("num_class") != 0) {
-      cfg_["num_output_group"] = cfg_["num_class"];
-      if (atoi(cfg_["num_class"].c_str()) > 1 && cfg_.count("objective") == 0) {
-        cfg_["objective"] = "multi:softmax";
-      }
-    }
-
-    if (cfg_.count("max_delta_step") == 0 && cfg_.count("objective") != 0 &&
-        cfg_["objective"] == "count:poisson") {
-      cfg_["max_delta_step"] = kMaxDeltaStepDefaultValue;
-    }
-
-    if (cfg_.count("objective") == 0) {
-      cfg_["objective"] = "reg:squarederror";
-    }
-    if (cfg_.count("booster") == 0) {
-      cfg_["booster"] = "gbtree";
-    }

+    ConfigureObjective();
    ConfigureUpdaters();

+    // FIXME(trivialfis): So which one should go first? Init or Configure?
    if (!this->ModelInitialized()) {
      mparam_.InitAllowUnknown(args);
      name_obj_ = cfg_["objective"];
@@ -315,7 +260,27 @@ class LearnerImpl : public Learner {

  void InitModel() override { this->LazyInitModel(); }

+  // Configuration can only be done after data is known
+  void ConfigurationWithKnownData(DMatrix* dmat) {
+    CHECK(ModelInitialized())
+        << "Always call InitModel or Load before any evaluation.";
+    this->ValidateDMatrix(dmat);
+    // Configure GPU parameters
+    // FIXME(trivialfis): How do we know dependent parameters are all set?
+    if (tparam_.tree_method == TreeMethod::kGPUHist ||
+        tparam_.tree_method == TreeMethod::kGPUExact ||
+        (cfg_.find("updater") != cfg_.cend() && cfg_.at("updater") == "gpu_coord_descent") ||
+        (cfg_.find("predictor") != cfg_.cend() &&
+         cfg_.at("predictor") == "gpu_predictor")) {
+      if (cfg_.find("n_gpus") == cfg_.cend()) {
+        tparam_.n_gpus = 1;
+      }
+    }
+  }
+
  void Load(dmlc::Stream* fi) override {
+    tparam_ = LearnerTrainParam();
+    tparam_.Init(std::vector<std::pair<std::string, std::string>>{});
    // TODO(tqchen) mark deprecation of old format.
    common::PeekableInStream fp(fi);
    // backward compatible header check.
@@ -352,8 +317,9 @@ class LearnerImpl : public Learner {
    }
    CHECK(fi->Read(&name_gbm_)) << "BoostLearner: wrong model format";
    // duplicated code with LazyInitModel
-    obj_.reset(ObjFunction::Create(name_obj_));
-    gbm_.reset(GradientBooster::Create(name_gbm_, cache_, mparam_.base_score));
+    obj_.reset(ObjFunction::Create(name_obj_, &tparam_));
+    gbm_.reset(GradientBooster::Create(name_gbm_, &tparam_,
+                                       cache_, mparam_.base_score));
    gbm_->Load(fi);
    if (mparam_.contain_extra_attrs != 0) {
      std::vector<std::pair<std::string, std::string> > attr;
@@ -380,14 +346,13 @@ class LearnerImpl : public Learner {
              << "  * JVM packages:   bst.setParam(\""
              << saved_param << "\", [new value])";
          }
-#else
-          if (saved_param == "predictor" && kv.second == "gpu_predictor") {
-            LOG(INFO) << "Parameter 'predictor' will be set to 'cpu_predictor' "
-                      << "since XGBoost wasn't compiled with GPU support.";
+#endif  // XGBOOST_USE_CUDA
+          // No visible GPU in the current environment.
+          if (GPUSet::AllVisible().Size() == 0 &&
+              (saved_param == "predictor" && kv.second == "gpu_predictor")) {
            cfg_["predictor"] = "cpu_predictor";
            kv.second = "cpu_predictor";
          }
-#endif  // XGBOOST_USE_CUDA
        }
      }
      attributes_ =
@@ -402,7 +367,8 @@ class LearnerImpl : public Learner {
      std::vector<std::string> metr;
      fi->Read(&metr);
      for (auto name : metr) {
-        metrics_.emplace_back(Metric::Create(name));
+        metrics_.emplace_back(
+            Metric::Create(name, &tparam_));
      }
    }
    cfg_["num_class"] = common::ToString(mparam_.num_class);
@@ -475,14 +441,11 @@ class LearnerImpl : public Learner {
  void UpdateOneIter(int iter, DMatrix* train) override {
    monitor_.Start("UpdateOneIter");

-    // TODO(trivialfis): Merge the duplicated code with BoostOneIter
-    CHECK(ModelInitialized())
-        << "Always call InitModel or LoadModel before update";
    if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
    }
-    this->ValidateDMatrix(train);
    this->PerformTreeMethodHeuristic(train);
+    this->ConfigurationWithKnownData(train);

    monitor_.Start("PredictRaw");
    this->PredictRaw(train, &preds_[train]);
@@ -497,14 +460,11 @@ class LearnerImpl : public Learner {
  void BoostOneIter(int iter, DMatrix* train,
                    HostDeviceVector<GradientPair>* in_gpair) override {
    monitor_.Start("BoostOneIter");
-
-    CHECK(ModelInitialized())
-        << "Always call InitModel or LoadModel before boost.";
    if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
    }
-    this->ValidateDMatrix(train);
    this->PerformTreeMethodHeuristic(train);
+    this->ConfigurationWithKnownData(train);

    gbm_->DoBoost(train, in_gpair);
    monitor_.Stop("BoostOneIter");
@@ -513,14 +473,16 @@ class LearnerImpl : public Learner {
  std::string EvalOneIter(int iter, const std::vector<DMatrix*>& data_sets,
                          const std::vector<std::string>& data_names) override {
    monitor_.Start("EvalOneIter");
+
    std::ostringstream os;
    os << '[' << iter << ']' << std::setiosflags(std::ios::fixed);
    if (metrics_.size() == 0 && tparam_.disable_default_eval_metric <= 0) {
-      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
+      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &tparam_));
      metrics_.back()->Configure(cfg_.begin(), cfg_.end());
    }
    for (size_t i = 0; i < data_sets.size(); ++i) {
      DMatrix * dmat = data_sets[i];
+      this->ConfigurationWithKnownData(dmat);
      this->PredictRaw(data_sets[i], &preds_[dmat]);
      obj_->EvalTransform(&preds_[dmat]);
      for (auto& ev : metrics_) {
@@ -562,10 +524,15 @@ class LearnerImpl : public Learner {
    return out;
  }

+  LearnerTrainParam const& GetLearnerTrainParameter() const override {
+    return tparam_;
+  }
+
  std::pair<std::string, bst_float> Evaluate(DMatrix* data,
                                             std::string metric) {
    if (metric == "auto") metric = obj_->DefaultEvalMetric();
-    std::unique_ptr<Metric> ev(Metric::Create(metric.c_str()));
+    std::unique_ptr<Metric> ev(Metric::Create(metric.c_str(), &tparam_));
+    this->ConfigurationWithKnownData(data);
    this->PredictRaw(data, &preds_[data]);
    obj_->EvalTransform(&preds_[data]);
    return std::make_pair(metric,
@@ -577,6 +544,10 @@ class LearnerImpl : public Learner {
               HostDeviceVector<bst_float>* out_preds, unsigned ntree_limit,
               bool pred_leaf, bool pred_contribs, bool approx_contribs,
               bool pred_interactions) const override {
+    // Must be an int: a bool would collapse the sum to 0/1 and never trip the check.
+    int const multiple_predictions = static_cast<int>(pred_leaf) +
+        static_cast<int>(pred_interactions) + static_cast<int>(pred_contribs);
+    CHECK_LE(multiple_predictions, 1) << "Perform one kind of prediction at a time.";
     if (pred_contribs) {
       gbm_->PredictContribution(data, &out_preds->HostVector(), ntree_limit, approx_contribs);
     } else if (pred_interactions) {
@@ -628,7 +599,7 @@ class LearnerImpl : public Learner {
        // things are okay, do nothing
        break;
       case TreeMethod::kExact:
-        LOG(CONSOLE) << "Tree method was set to be "
+        LOG(WARNING) << "Tree method was set to be "
                     << "exact"
                     << "', but only 'approx' and 'hist' is available for distributed "
                        "training. The `tree_method` parameter is now being "
@@ -643,11 +614,11 @@ class LearnerImpl : public Learner {
                   << static_cast<int>(current_tree_method) << ") detected";
      }
      if (current_tree_method != TreeMethod::kHist) {
-        LOG(CONSOLE) << "Tree method is automatically selected to be 'approx'"
+        LOG(WARNING) << "Tree method is automatically selected to be 'approx'"
                        " for distributed training.";
        tparam_.tree_method = TreeMethod::kApprox;
      } else {
-        LOG(CONSOLE) << "Tree method is specified to be 'hist'"
+        LOG(WARNING) << "Tree method is specified to be 'hist'"
                        " for distributed training.";
        tparam_.tree_method = TreeMethod::kHist;
      }
@@ -701,7 +672,7 @@ class LearnerImpl : public Learner {

  // return whether model is already initialized.
  inline bool ModelInitialized() const { return gbm_ != nullptr; }
-  // lazily initialize the model if it haven't yet been initialized.
+  // lazily initialize the model based on configuration if it hasn't been initialized yet.
  inline void LazyInitModel() {
    if (this->ModelInitialized()) return;
    // estimate feature bound
@@ -725,13 +696,15 @@ class LearnerImpl : public Learner {
    // setup
    cfg_["num_feature"] = common::ToString(mparam_.num_feature);
    CHECK(obj_ == nullptr && gbm_ == nullptr);
-    obj_.reset(ObjFunction::Create(name_obj_));
+    obj_.reset(ObjFunction::Create(name_obj_, &tparam_));
    obj_->Configure(cfg_.begin(), cfg_.end());
    // reset the base score
    mparam_.base_score = obj_->ProbToMargin(mparam_.base_score);
-    gbm_.reset(GradientBooster::Create(name_gbm_, cache_, mparam_.base_score));
+    gbm_.reset(GradientBooster::Create(name_gbm_, &tparam_,
+                                       cache_, mparam_.base_score));
    gbm_->Configure(cfg_.begin(), cfg_.end());
  }
+
  /*!
   * \brief get un-transformed prediction
   * \param data training data matrix
@@ -761,8 +734,6 @@ class LearnerImpl : public Learner {

  // model parameter
  LearnerModelParam mparam_;
-  // training parameter
-  LearnerTrainParam tparam_;
  // configurations
  std::map<std::string, std::string> cfg_;
  // attributes
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -9,7 +9,9 @@
 #include <vector>
 #include <limits>

+#include "xgboost/data.h"
 #include "./param.h"
+#include "../gbm/gblinear_model.h"
 #include "../common/random.h"

 namespace xgboost {
--- a/src/linear/linear_updater.cc
+++ b/src/linear/linear_updater.cc
@@ -11,12 +11,14 @@ DMLC_REGISTRY_ENABLE(::xgboost::LinearUpdaterReg);

 namespace xgboost {

-LinearUpdater* LinearUpdater::Create(const std::string& name) {
+LinearUpdater* LinearUpdater::Create(const std::string& name, LearnerTrainParam const* lparam) {
  auto *e = ::dmlc::Registry< ::xgboost::LinearUpdaterReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown linear updater " << name;
  }
-  return (e->body)();
+  auto p_linear = (e->body)();
+  p_linear->learner_param_ = lparam;
+  return p_linear;
 }

 }  // namespace xgboost
--- a/src/linear/param.h
+++ b/src/linear/param.h
@@ -28,8 +28,6 @@ struct LinearTrainParam : public dmlc::Parameter<LinearTrainParam> {
  /*! \brief regularization weight for L1 norm */
  float reg_alpha;
  int feature_selector;
-  int n_gpus;
-  int gpu_id;
  // declare parameters
  DMLC_DECLARE_PARAMETER(LinearTrainParam) {
    DMLC_DECLARE_FIELD(learning_rate)
@@ -52,10 +50,6 @@ struct LinearTrainParam : public dmlc::Parameter<LinearTrainParam> {
        .add_enum("greedy", kGreedy)
        .add_enum("random", kRandom)
        .describe("Feature selection or ordering method.");
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe(
-        "Number of devices to use.");
-    DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe(
-        "Primary device ordinal.");
    // alias of parameters
    DMLC_DECLARE_ALIAS(learning_rate, eta);
    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -164,7 +164,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
                      const gbm::GBLinearModelParam &model_param) {
    if (!shards_.empty()) return;

-    dist_ = GPUDistribution::Block(GPUSet::All(tparam_.gpu_id, tparam_.n_gpus,
+    dist_ = GPUDistribution::Block(GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus,
                                               p_fmat->Info().num_row_));
    auto devices = dist_.Devices();

--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -314,11 +314,6 @@ struct EvalEWiseBase : public Metric {
  explicit EvalEWiseBase(char const* policy_param) :
    policy_{policy_param}, reducer_{policy_} {}

-  void Configure(
-      const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.InitAllowUnknown(args);
-  }
-
  bst_float Eval(const HostDeviceVector<bst_float>& preds,
                 const MetaInfo& info,
                 bool distributed) override {
@@ -328,7 +323,7 @@ struct EvalEWiseBase : public Metric {
        << "hint: use merror or mlogloss for multi-class classification";
    const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
    // Dealing with ndata < n_gpus.
-    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
+    GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);

    auto result =
        reducer_.Reduce(devices, info.weights_, info.labels_, preds);
@@ -347,8 +342,6 @@ struct EvalEWiseBase : public Metric {
 private:
  Policy policy_;

-  MetricParam param_;
-
  ElementWiseMetricsReduction<Policy> reducer_;
 };

--- a/src/metric/metric.cc
+++ b/src/metric/metric.cc
@@ -1,19 +1,18 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2019 by Contributors
 * \file metric_registry.cc
 * \brief Registry of objective functions.
 */
-#include <xgboost/metric.h>
 #include <dmlc/registry.h>
-
-#include "metric_common.h"
+#include <xgboost/metric.h>
+#include <xgboost/generic_parameters.h>

 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
 }

 namespace xgboost {
-Metric* Metric::Create(const std::string& name) {
+Metric* Metric::Create(const std::string& name, LearnerTrainParam const* tparam) {
  std::string buf = name;
  std::string prefix = name;
  auto pos = buf.find('@');
@@ -22,21 +21,24 @@ Metric* Metric::Create(const std::string& name) {
    if (e == nullptr) {
      LOG(FATAL) << "Unknown metric function " << name;
    }
-    return (e->body)(nullptr);
+    auto p_metric = (e->body)(nullptr);
+    p_metric->tparam_ = tparam;
+    return p_metric;
  } else {
    std::string prefix = buf.substr(0, pos);
    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
    if (e == nullptr) {
      LOG(FATAL) << "Unknown metric function " << name;
    }
-    return (e->body)(buf.substr(pos + 1, buf.length()).c_str());
+    auto p_metric = (e->body)(buf.substr(pos + 1, buf.length()).c_str());
+    p_metric->tparam_ = tparam;
+    return p_metric;
  }
 }
 }  // namespace xgboost

 namespace xgboost {
 namespace metric {
-DMLC_REGISTER_PARAMETER(MetricParam);

 // List of files that will be force linked in static links.
 DMLC_REGISTRY_LINK_TAG(elementwise_metric);
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -11,20 +11,6 @@
 namespace xgboost {
 namespace metric {

-// Created exclusively for GPU.
-struct MetricParam : public dmlc::Parameter<MetricParam> {
-  int n_gpus;
-  int gpu_id;
-  DMLC_DECLARE_PARAMETER(MetricParam) {
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for objective function evaluation");
-  };
-};
-
 class PackedReduceResult {
  double residue_sum_;
  double weights_sum_;
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -168,11 +168,6 @@ class MultiClassMetricsReduction {
 */
 template<typename Derived>
 struct EvalMClassBase : public Metric {
-  void Configure(
-      const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.InitAllowUnknown(args);
-  }
-
  bst_float Eval(const HostDeviceVector<bst_float> &preds,
                 const MetaInfo &info,
                 bool distributed) override {
@@ -185,7 +180,7 @@ struct EvalMClassBase : public Metric {
        << " use logloss for binary classification";
    const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());

-    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
+    GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
    auto result = reducer_.Reduce(devices, nclass, info.weights_, info.labels_, preds);
    double dat[2] { result.Residue(), result.Weights() };

@@ -215,7 +210,6 @@ struct EvalMClassBase : public Metric {

 private:
  MultiClassMetricsReduction<Derived> reducer_;
-  MetricParam param_;
  // used to store error message
  const char *error_msg_;
 };
--- a/src/objective/hinge.cu
+++ b/src/objective/hinge.cu
@@ -18,29 +18,12 @@ namespace obj {
 DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
 #endif  // defined(XGBOOST_USE_CUDA)

-struct HingeObjParam : public dmlc::Parameter<HingeObjParam> {
-  int n_gpus;
-  int gpu_id;
-  DMLC_DECLARE_PARAMETER(HingeObjParam) {
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for objective function evaluation");
-  }
-};
-
 class HingeObj : public ObjFunction {
 public:
  HingeObj() = default;

  void Configure(
-      const std::vector<std::pair<std::string, std::string> > &args) override {
-    param_.InitAllowUnknown(args);
-    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
-    label_correct_.Resize(devices_.IsEmpty() ? 1 : devices_.Size());
-  }
+      const std::vector<std::pair<std::string, std::string> > &args) override {}  // No-op: the device set is now built from tparam_ at each use.

  void GetGradient(const HostDeviceVector<bst_float> &preds,
                   const MetaInfo &info,
@@ -57,7 +40,6 @@ class HingeObj : public ObjFunction {
    out_gpair->Resize(ndata);
    common::Transform<>::Init(
        [=] XGBOOST_DEVICE(size_t _idx,
-                           common::Span<int> _label_correct,
                           common::Span<GradientPair> _out_gpair,
                           common::Span<const bst_float> _preds,
                           common::Span<const bst_float> _labels,
@@ -75,8 +57,9 @@ class HingeObj : public ObjFunction {
          }
          _out_gpair[_idx] = GradientPair(g, h);
        },
-        common::Range{0, static_cast<int64_t>(ndata)}, devices_).Eval(
-            &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
+        common::Range{0, static_cast<int64_t>(ndata)},
+        GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata)).Eval(
+            out_gpair, &preds, &info.labels_, &info.weights_);
  }

  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -84,22 +67,16 @@ class HingeObj : public ObjFunction {
        [] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
          _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
        },
-        common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, devices_)
+        common::Range{0, static_cast<int64_t>(io_preds->Size()), 1},
+        GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
        .Eval(io_preds);
  }

  const char* DefaultEvalMetric() const override {
    return "error";
  }
-
- private:
-  GPUSet devices_;
-  HostDeviceVector<int> label_correct_;
-  HingeObjParam param_;
 };

-// register the objective functions
-DMLC_REGISTER_PARAMETER(HingeObjParam);
 // register the objective functions
 XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
 .describe("Hinge loss. Expects labels to be in [0,1f]")
--- a/src/objective/multiclass_obj.cu
+++ b/src/objective/multiclass_obj.cu
@@ -13,6 +13,8 @@
 #include <algorithm>
 #include <limits>
 #include <utility>
+
+#include "../common/common.h"
 #include "../common/math.h"
 #include "../common/transform.h"

@@ -25,18 +27,10 @@ DMLC_REGISTRY_FILE_TAG(multiclass_obj_gpu);

 struct SoftmaxMultiClassParam : public dmlc::Parameter<SoftmaxMultiClassParam> {
  int num_class;
-  int n_gpus;
-  int gpu_id;
  // declare parameters
  DMLC_DECLARE_PARAMETER(SoftmaxMultiClassParam) {
    DMLC_DECLARE_FIELD(num_class).set_lower_bound(1)
        .describe("Number of output class in the multi-class classification.");
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for objective function evaluation");
  }
 };
 // TODO(trivialfis): Currently the sharding in softmax is less than ideal
@@ -49,8 +43,6 @@ class SoftmaxMultiClassObj : public ObjFunction {
  }
  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
    param_.InitAllowUnknown(args);
-    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
-    label_correct_.Resize(devices_.IsEmpty() ? 1 : devices_.Size());
  }
  void GetGradient(const HostDeviceVector<bst_float>& preds,
                   const MetaInfo& info,
@@ -63,11 +55,14 @@ class SoftmaxMultiClassObj : public ObjFunction {
    const int nclass = param_.num_class;
    const auto ndata = static_cast<int64_t>(preds.Size() / nclass);

-    out_gpair->Shard(GPUDistribution::Granular(devices_, nclass));
-    info.labels_.Shard(GPUDistribution::Block(devices_));
-    info.weights_.Shard(GPUDistribution::Block(devices_));
-    preds.Shard(GPUDistribution::Granular(devices_, nclass));
-    label_correct_.Shard(GPUDistribution::Block(devices_));
+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
+    out_gpair->Shard(GPUDistribution::Granular(devices, nclass));
+    info.labels_.Shard(GPUDistribution::Block(devices));
+    info.weights_.Shard(GPUDistribution::Block(devices));
+    preds.Shard(GPUDistribution::Granular(devices, nclass));
+
+    label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
+    label_correct_.Shard(GPUDistribution::Block(devices));

    out_gpair->Resize(preds.Size());
    label_correct_.Fill(1);
@@ -101,7 +96,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
            p = label == k ? p - 1.0f : p;
            gpair[idx * nclass + k] = GradientPair(p * wt, h);
          }
-        }, common::Range{0, ndata}, devices_, false)
+        }, common::Range{0, ndata}, devices, false)
        .Eval(out_gpair, &info.labels_, &preds, &info.weights_, &label_correct_);

    std::vector<int>& label_correct_h = label_correct_.HostVector();
@@ -126,6 +121,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
    const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
    max_preds_.Resize(ndata);

+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size());
    if (prob) {
      common::Transform<>::Init(
          [=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
@@ -133,11 +129,11 @@ class SoftmaxMultiClassObj : public ObjFunction {
                _preds.subspan(_idx * nclass, nclass);
            common::Softmax(point.begin(), point.end());
          },
-          common::Range{0, ndata}, GPUDistribution::Granular(devices_, nclass))
+          common::Range{0, ndata}, GPUDistribution::Granular(devices, nclass))
        .Eval(io_preds);
    } else {
-      io_preds->Shard(GPUDistribution::Granular(devices_, nclass));
-      max_preds_.Shard(GPUDistribution::Block(devices_));
+      io_preds->Shard(GPUDistribution::Granular(devices, nclass));
+      max_preds_.Shard(GPUDistribution::Block(devices));
      common::Transform<>::Init(
          [=] XGBOOST_DEVICE(size_t _idx,
                             common::Span<const bst_float> _preds,
@@ -148,7 +144,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
                common::FindMaxIndex(point.cbegin(),
                                     point.cend()) - point.cbegin();
          },
-          common::Range{0, ndata}, devices_, false)
+          common::Range{0, ndata}, devices, false)
        .Eval(io_preds, &max_preds_);
    }
    if (!prob) {
@@ -162,7 +158,6 @@ class SoftmaxMultiClassObj : public ObjFunction {
  bool output_prob_;
  // parameter
  SoftmaxMultiClassParam param_;
-  GPUSet devices_;
  // Cache for max_preds
  HostDeviceVector<bst_float> max_preds_;
  HostDeviceVector<int> label_correct_;
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -14,7 +14,7 @@ DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg);

 namespace xgboost {
 // implement factory functions
-ObjFunction* ObjFunction::Create(const std::string& name) {
+ObjFunction* ObjFunction::Create(const std::string& name, LearnerTrainParam const* tparam) {
  auto *e = ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->Find(name);
  if (e == nullptr) {
    for (const auto& entry : ::dmlc::Registry< ::xgboost::ObjFunctionReg>::List()) {
@@ -22,7 +22,9 @@ ObjFunction* ObjFunction::Create(const std::string& name) {
    }
    LOG(FATAL) << "Unknown objective function " << name;
  }
-  return (e->body)();
+  auto pobj = (e->body)();
+  pobj->tparam_ = tparam;
+  return pobj;
 }

 }  // namespace xgboost
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -28,18 +28,10 @@ DMLC_REGISTRY_FILE_TAG(regression_obj_gpu);

 struct RegLossParam : public dmlc::Parameter<RegLossParam> {
  float scale_pos_weight;
-  int n_gpus;
-  int gpu_id;
  // declare parameters
  DMLC_DECLARE_PARAMETER(RegLossParam) {
    DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
      .describe("Scale the weight of positive examples by this factor");
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-      .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-      .set_lower_bound(0)
-      .set_default(0)
-      .describe("gpu to use for objective function evaluation");
  }
 };

@@ -53,8 +45,6 @@ class RegLossObj : public ObjFunction {

  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
    param_.InitAllowUnknown(args);
-    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
-    label_correct_.Resize(devices_.IsEmpty() ? 1 : devices_.Size());
  }

  void GetGradient(const HostDeviceVector<bst_float>& preds,
@@ -67,6 +57,8 @@ class RegLossObj : public ObjFunction {
        << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size();
    size_t ndata = preds.Size();
    out_gpair->Resize(ndata);
+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
+    label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
    label_correct_.Fill(1);

    bool is_null_weight = info.weights_.Size() == 0;
@@ -91,7 +83,7 @@ class RegLossObj : public ObjFunction {
          _out_gpair[_idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
                                          Loss::SecondOrderGradient(p, label) * w);
        },
-        common::Range{0, static_cast<int64_t>(ndata)}, devices_).Eval(
+        common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval(
            &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);

    // copy "label correct" flags back to host
@@ -113,7 +105,8 @@ class RegLossObj : public ObjFunction {
        [] XGBOOST_DEVICE(size_t _idx, common::Span<float> _preds) {
          _preds[_idx] = Loss::PredTransform(_preds[_idx]);
        }, common::Range{0, static_cast<int64_t>(io_preds->Size())},
-        devices_).Eval(io_preds);
+        GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
+        .Eval(io_preds);
  }

  float ProbToMargin(float base_score) const override {
@@ -122,7 +115,6 @@ class RegLossObj : public ObjFunction {

 protected:
  RegLossParam param_;
-  GPUSet devices_;
 };

 // register the objective functions
@@ -181,18 +173,10 @@ XGBOOST_REGISTER_OBJECTIVE(GPULogisticRaw, "gpu:binary:logitraw")
 // declare parameter
 struct PoissonRegressionParam : public dmlc::Parameter<PoissonRegressionParam> {
  float max_delta_step;
-  int n_gpus;
-  int gpu_id;
  DMLC_DECLARE_PARAMETER(PoissonRegressionParam) {
    DMLC_DECLARE_FIELD(max_delta_step).set_lower_bound(0.0f).set_default(0.7f)
        .describe("Maximum delta step we allow each weight estimation to be." \
                  " This parameter is required for possion regression.");
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for objective function evaluation");
  }
 };

@@ -202,8 +186,6 @@ class PoissonRegression : public ObjFunction {
  // declare functions
  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
    param_.InitAllowUnknown(args);
-    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
-    label_correct_.Resize(devices_.IsEmpty() ? 1 : devices_.Size());
  }

  void GetGradient(const HostDeviceVector<bst_float>& preds,
@@ -214,6 +196,8 @@ class PoissonRegression : public ObjFunction {
    CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
    size_t ndata = preds.Size();
    out_gpair->Resize(ndata);
+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
+    label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
    label_correct_.Fill(1);

    bool is_null_weight = info.weights_.Size() == 0;
@@ -234,7 +218,7 @@ class PoissonRegression : public ObjFunction {
          _out_gpair[_idx] = GradientPair{(expf(p) - y) * w,
                                          expf(p + max_delta_step) * w};
        },
-        common::Range{0, static_cast<int64_t>(ndata)}, devices_).Eval(
+        common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval(
            &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
    // copy "label correct" flags back to host
    std::vector<int>& label_correct_h = label_correct_.HostVector();
@@ -249,7 +233,8 @@ class PoissonRegression : public ObjFunction {
        [] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
          _preds[_idx] = expf(_preds[_idx]);
        },
-        common::Range{0, static_cast<int64_t>(io_preds->Size())}, devices_)
+        common::Range{0, static_cast<int64_t>(io_preds->Size())},
+        GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
        .Eval(io_preds);
  }
  void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -263,7 +248,6 @@ class PoissonRegression : public ObjFunction {
  }

 private:
-  GPUSet devices_;
  PoissonRegressionParam param_;
  HostDeviceVector<int> label_correct_;
 };
@@ -279,8 +263,9 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
 // cox regression for survival data (negative values mean they are censored)
 class CoxRegression : public ObjFunction {
 public:
-  // declare functions
-  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
+  void Configure(  // No-op: the empty body reads nothing from args.
+      const std::vector<std::pair<std::string, std::string> > &args) override {}
+
  void GetGradient(const HostDeviceVector<bst_float>& preds,
                   const MetaInfo &info,
                   int iter,
@@ -363,29 +348,11 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
 .describe("Cox regression for censored survival data (negative labels are considered censored).")
 .set_body([]() { return new CoxRegression(); });

-
-struct GammaRegressionParam : public dmlc::Parameter<GammaRegressionParam> {
-  int n_gpus;
-  int gpu_id;
-  DMLC_DECLARE_PARAMETER(GammaRegressionParam) {
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for objective function evaluation");
-  }
-};
-
 // gamma regression
 class GammaRegression : public ObjFunction {
 public:
-  // declare functions
-  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.InitAllowUnknown(args);
-    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
-    label_correct_.Resize(devices_.IsEmpty() ? 1 : devices_.Size());
-  }
+  void Configure(  // No-op: device selection and label_correct_ sizing now happen in GetGradient.
+      const std::vector<std::pair<std::string, std::string> > &args) override {}

  void GetGradient(const HostDeviceVector<bst_float> &preds,
                   const MetaInfo &info,
@@ -394,7 +361,9 @@ class GammaRegression : public ObjFunction {
    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
    CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
    const size_t ndata = preds.Size();
+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
    out_gpair->Resize(ndata);
+    label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
    label_correct_.Fill(1);

    const bool is_null_weight = info.weights_.Size() == 0;
@@ -413,7 +382,7 @@ class GammaRegression : public ObjFunction {
          }
          _out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w);
        },
-        common::Range{0, static_cast<int64_t>(ndata)}, devices_).Eval(
+        common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval(
            &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);

    // copy "label correct" flags back to host
@@ -429,7 +398,8 @@ class GammaRegression : public ObjFunction {
        [] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
          _preds[_idx] = expf(_preds[_idx]);
        },
-        common::Range{0, static_cast<int64_t>(io_preds->Size())}, devices_)
+        common::Range{0, static_cast<int64_t>(io_preds->Size())},
+        GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
        .Eval(io_preds);
  }
  void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -443,13 +413,9 @@ class GammaRegression : public ObjFunction {
  }

 private:
-  GPUSet devices_;
-  GammaRegressionParam param_;
  HostDeviceVector<int> label_correct_;
 };

-// register the objective functions
-DMLC_REGISTER_PARAMETER(GammaRegressionParam);
 // register the objective functions
 XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma")
 .describe("Gamma regression for severity data.")
@@ -459,17 +425,9 @@ XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma")
 // declare parameter
 struct TweedieRegressionParam : public dmlc::Parameter<TweedieRegressionParam> {
  float tweedie_variance_power;
-  int n_gpus;
-  int gpu_id;
  DMLC_DECLARE_PARAMETER(TweedieRegressionParam) {
    DMLC_DECLARE_FIELD(tweedie_variance_power).set_range(1.0f, 2.0f).set_default(1.5f)
      .describe("Tweedie variance power.  Must be between in range [1, 2).");
-    DMLC_DECLARE_FIELD(n_gpus).set_default(1).set_lower_bound(GPUSet::kAll)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for objective function evaluation");
  }
 };

@@ -479,8 +437,6 @@ class TweedieRegression : public ObjFunction {
  // declare functions
  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
    param_.InitAllowUnknown(args);
-    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
-    label_correct_.Resize(devices_.IsEmpty() ? 1 : devices_.Size());
  }

  void GetGradient(const HostDeviceVector<bst_float>& preds,
@@ -491,6 +447,9 @@ class TweedieRegression : public ObjFunction {
    CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
    const size_t ndata = preds.Size();
    out_gpair->Resize(ndata);
+
+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
+    label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
    label_correct_.Fill(1);

    const bool is_null_weight = info.weights_.Size() == 0;
@@ -514,7 +473,7 @@ class TweedieRegression : public ObjFunction {
              std::exp((1 - rho) * p) + (2 - rho) * expf((2 - rho) * p);
          _out_gpair[_idx] = GradientPair(grad * w, hess * w);
        },
-        common::Range{0, static_cast<int64_t>(ndata), 1}, devices_)
+        common::Range{0, static_cast<int64_t>(ndata), 1}, devices)
        .Eval(&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);

    // copy "label correct" flags back to host
@@ -530,7 +489,8 @@ class TweedieRegression : public ObjFunction {
        [] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
          _preds[_idx] = expf(_preds[_idx]);
        },
-        common::Range{0, static_cast<int64_t>(io_preds->Size())}, devices_)
+        common::Range{0, static_cast<int64_t>(io_preds->Size())},
+        GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
        .Eval(io_preds);
  }

@@ -546,7 +506,6 @@ class TweedieRegression : public ObjFunction {
  }

 private:
-  GPUSet devices_;
  TweedieRegressionParam param_;
  HostDeviceVector<int> label_correct_;
 };
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -82,7 +82,7 @@ class CPUPredictor : public Predictor {
      for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
        RegTree::FVec& feats = thread_temp[0];
        const auto ridx = static_cast<int64_t>(batch.base_rowid + i);
-         auto inst = batch[i];
+        auto inst = batch[i];
        for (int gid = 0; gid < num_group; ++gid) {
          const size_t offset = ridx * num_group + gid;
          preds[offset] +=
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -20,20 +20,6 @@ namespace predictor {

 DMLC_REGISTRY_FILE_TAG(gpu_predictor);

-/*! \brief prediction parameters */
-struct GPUPredictionParam : public dmlc::Parameter<GPUPredictionParam> {
-  int gpu_id;
-  int n_gpus;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(GPUPredictionParam) {
-    DMLC_DECLARE_FIELD(gpu_id).set_lower_bound(0).set_default(0).describe(
-        "Device ordinal for GPU prediction.");
-    DMLC_DECLARE_FIELD(n_gpus).set_lower_bound(-1).set_default(1).describe(
-        "Number of devices to use for prediction.");
-  }
-};
-DMLC_REGISTER_PARAMETER(GPUPredictionParam);
-
 template <typename IterT>
 void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) {
  thrust::transform(begin_itr, end_itr, begin_itr,
@@ -387,14 +373,15 @@ class GPUPredictor : public xgboost::Predictor {
  }

 public:
-  GPUPredictor()                                               // NOLINT
-      : cpu_predictor_(Predictor::Create("cpu_predictor")) {}  // NOLINT
+  GPUPredictor()  // Creates the CPU predictor held in cpu_predictor_.
+      : cpu_predictor_(Predictor::Create("cpu_predictor", learner_param_)) {}  // NOTE(review): Predictor::Create assigns learner_param_ only after this constructor returns; confirm the pointer forwarded here is initialised.

  void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                    const gbm::GBTreeModel& model, int tree_begin,
                    unsigned ntree_limit = 0) override {
-    GPUSet devices = GPUSet::All(
-        param_.gpu_id, param_.n_gpus, dmat->Info().num_row_);
+    GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus,
+                                 dmat->Info().num_row_);
+    CHECK_NE(devices.Size(), 0);
    ConfigureShards(devices);

    if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
@@ -508,9 +495,8 @@ class GPUPredictor : public xgboost::Predictor {
            const std::vector<std::shared_ptr<DMatrix>>& cache) override {
    Predictor::Init(cfg, cache);
    cpu_predictor_->Init(cfg, cache);
-    param_.InitAllowUnknown(cfg);

-    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus);
+    GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus);
    ConfigureShards(devices);
  }

@@ -527,7 +513,6 @@ class GPUPredictor : public xgboost::Predictor {
      });
  }

-  GPUPredictionParam param_;
  std::unique_ptr<Predictor> cpu_predictor_;
  std::vector<DeviceShard> shards_;
  GPUSet devices_;
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -15,12 +15,14 @@ void Predictor::Init(
    cache_[d.get()].data = d;
  }
 }
-Predictor* Predictor::Create(std::string name) {
+Predictor* Predictor::Create(std::string const& name, LearnerTrainParam const* learner_param) {
  auto* e = ::dmlc::Registry<PredictorReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown predictor type " << name;
  }
-  return (e->body)();
+  auto p_predictor =  (e->body)();  // Build the predictor through its registered factory body.
+  p_predictor->learner_param_ = learner_param;  // Assigned after construction, so constructors cannot rely on it.
+  return p_predictor;  // Raw pointer; presumably owned by the caller (GPUPredictor keeps it in a unique_ptr) - confirm.
 }
 }  // namespace xgboost

--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -72,10 +72,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
  bool refresh_leaf;
  // auxiliary data structure
  std::vector<int> monotone_constraints;
-  // gpu to use for single gpu algorithms
-  int gpu_id;
-  // number of GPUs to use
-  int n_gpus;
  // the criteria to use for ranking splits
  std::string split_evaluator;

@@ -191,14 +187,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
    DMLC_DECLARE_FIELD(monotone_constraints)
        .set_default(std::vector<int>())
        .describe("Constraint of variable monotonicity");
-    DMLC_DECLARE_FIELD(gpu_id)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("gpu to use for single gpu algorithms");
-    DMLC_DECLARE_FIELD(n_gpus)
-        .set_lower_bound(-1)
-        .set_default(1)
-        .describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs");
    DMLC_DECLARE_FIELD(split_evaluator)
        .set_default("elastic_net,monotonic,interaction")
        .describe("The criteria to use for ranking splits");
--- a/src/tree/tree_updater.cc
+++ b/src/tree/tree_updater.cc
@@ -14,12 +14,14 @@ DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);

 namespace xgboost {

-TreeUpdater* TreeUpdater::Create(const std::string& name) {
+TreeUpdater* TreeUpdater::Create(const std::string& name, LearnerTrainParam const* tparam) {
  auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown tree updater " << name;
  }
-  return (e->body)();
+  auto p_updater = (e->body)();  // Build the updater through its registered factory body.
+  p_updater->tparam_ = tparam;  // Assigned after construction, so constructors cannot rely on it.
+  return p_updater;  // Raw pointer; callers in this patch hand it to a smart pointer's reset().
 }

 }  // namespace xgboost
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -769,7 +769,7 @@ class DistColMaker : public ColMaker {
 public:
  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
    param_.InitAllowUnknown(args);
-    pruner_.reset(TreeUpdater::Create("prune"));
+    pruner_.reset(TreeUpdater::Create("prune", tparam_));
    pruner_->Init(args);
    spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
    spliteval_->Init(args);
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -443,9 +443,10 @@ void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
                 common::Span<const DeviceNodeStats> nodes,
                 int nUniqKeys,
                 NodeIdT nodeStart, int len, const TrainParam param,
-                 ArgMaxByKeyAlgo algo) {
+                 ArgMaxByKeyAlgo algo,
+                 GPUSet const& devices) {
  dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
-      param.gpu_id, nodeSplits.data(), nUniqKeys,
+      *(devices.begin()), nodeSplits.data(), nUniqKeys,
      ExactSplitCandidate());
  int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
  switch (algo) {
@@ -585,7 +586,7 @@ class GPUMaker : public TreeUpdater {
     maxNodes_ = (1 << (param_.max_depth + 1)) - 1;
     maxLeaves_ = 1 << param_.max_depth;

-     devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
+     devices_ = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
  }

  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
@@ -641,7 +642,7 @@ class GPUMaker : public TreeUpdater {
    float min_split_loss = param_.min_split_loss;
    auto gpu_param = GPUTrainingParam(param_);

-    dh::LaunchN(param_.gpu_id, nNodes, [=] __device__(int uid) {
+    dh::LaunchN(*(devices_.begin()), nNodes, [=] __device__(int uid) {
      int absNodeId = uid + nodeStart;
      ExactSplitCandidate s = d_nodeSplits[uid];
      if (s.IsSplittable(min_split_loss)) {
@@ -683,16 +684,18 @@ class GPUMaker : public TreeUpdater {
                    instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
                    n_cols_, tmpScanGradBuff_, tmp_scan_key_buff_,
                    colIds_, nodeStart);
+    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
    ArgMaxByKey(nodeSplits_, gradscans_, gradsums_,
                vals_.CurrentSpan(), colIds_, nodeAssigns_.CurrentSpan(),
                nodes_, nNodes, nodeStart, n_vals_, param_,
-                level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
+                level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem,
+                devices);
    Split2Node(nNodes, nodeStart);
  }

  void AllocateAllData(int offsetSize) {
    int tmpBuffSize = ScanTempBufferSize(n_vals_);
-    ba_.Allocate(param_.gpu_id, &vals_, n_vals_,
+    ba_.Allocate(*(devices_.begin()), &vals_, n_vals_,
                 &vals_cached_, n_vals_, &instIds_, n_vals_, &inst_ids_cached_, n_vals_,
                 &colOffsets_, offsetSize, &gradsInst_, n_rows_, &nodeAssigns_, n_vals_,
                 &nodeLocations_, n_vals_, &nodes_, maxNodes_, &node_assigns_per_inst_,
@@ -783,7 +786,7 @@ class GPUMaker : public TreeUpdater {
      auto d_nodes = nodes_;
      auto d_sums = gradsums_;
      auto gpu_params = GPUTrainingParam(param_);
-      dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) {
+      dh::LaunchN(*(devices_.begin()), 1, [=] __device__(int idx) {
        d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
      });
    } else {
@@ -800,7 +803,7 @@ class GPUMaker : public TreeUpdater {
          nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
          colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
      // gather the node assignments across all other columns too
-      dh::Gather(param_.gpu_id, nodeAssigns_.Current(),
+      dh::Gather(*(devices_.begin()), nodeAssigns_.Current(),
                 node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
      SortKeys(level);
    }
@@ -811,7 +814,7 @@ class GPUMaker : public TreeUpdater {
    // but we don't need more than level+1 bits for sorting!
    SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_,
                  colOffsets_, 0, level + 1);
-    dh::Gather<float, int>(param_.gpu_id, vals_.other(),
+    dh::Gather<float, int>(*(devices_.begin()), vals_.other(),
                           vals_.Current(), instIds_.other(), instIds_.Current(),
                           nodeLocations_.Current(), n_vals_);
    vals_.buff.selector ^= 1;
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2017-2018 XGBoost contributors
+ * Copyright 2017-2019 XGBoost contributors
 */
 #include <thrust/copy.h>
 #include <thrust/functional.h>
@@ -322,7 +322,7 @@ __global__ void EvaluateSplitKernel(
        node_histogram,               // histogram for gradients
    common::Span<const int> feature_set,  // Selected features
    DeviceNodeStats node,
-  ELLPackMatrix matrix,
+    ELLPackMatrix matrix,
    GPUTrainingParam gpu_param,
    common::Span<DeviceSplitCandidate> split_candidates,  // resulting split
    ValueConstraint value_constraint,
@@ -1377,13 +1377,16 @@ template <typename GradientSumT>
 class GPUHistMakerSpecialised{
 public:
  GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {}
-  void Init(
-      const std::vector<std::pair<std::string, std::string>>& args) {
+  void Init(const std::vector<std::pair<std::string, std::string>>& args,
+            LearnerTrainParam const* lparam) {
    param_.InitAllowUnknown(args);
+    learner_param_ = lparam;
    hist_maker_param_.InitAllowUnknown(args);
-    CHECK(param_.n_gpus != 0) << "Must have at least one device";
-    n_devices_ = param_.n_gpus;
-    dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus));
+    auto devices = GPUSet::All(learner_param_->gpu_id,
+                               learner_param_->n_gpus);
+    n_devices_ = devices.Size();
+    CHECK(n_devices_ != 0) << "Must have at least one device";
+    dist_ = GPUDistribution::Block(devices);

    dh::CheckComputeCapability();

@@ -1446,7 +1449,8 @@ class GPUHistMakerSpecialised{

    // Find the cuts.
    monitor_.StartCuda("Quantiles");
-    common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
+    common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows,
+                         GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus));
    n_bins_ = hmat_.row_ptr.back();
    monitor_.StopCuda("Quantiles");
    auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
@@ -1552,6 +1556,7 @@ class GPUHistMakerSpecialised{
  int n_bins_;

  GPUHistMakerTrainParam hist_maker_param_;
+  LearnerTrainParam const* learner_param_;
  common::GHistIndexMatrix gmat_;

  dh::AllReducer reducer_;
@@ -1573,10 +1578,10 @@ class GPUHistMaker : public TreeUpdater {
    double_maker_.reset();
    if (hist_maker_param_.single_precision_histogram) {
      float_maker_.reset(new GPUHistMakerSpecialised<GradientPair>());
-      float_maker_->Init(args);
+      float_maker_->Init(args, tparam_);
    } else {
      double_maker_.reset(new GPUHistMakerSpecialised<GradientPairPrecise>());
-      double_maker_->Init(args);
+      double_maker_->Init(args, tparam_);
    }
  }

--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -22,7 +22,7 @@ DMLC_REGISTRY_FILE_TAG(updater_prune);
 class TreePruner: public TreeUpdater {
 public:
  TreePruner() {
-    syncher_.reset(TreeUpdater::Create("sync"));
+    syncher_.reset(TreeUpdater::Create("sync", tparam_));  // NOTE(review): tparam_ is read during construction; its assignment is not visible here - confirm it is set before this call
  }
  // set training parameter
  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -35,7 +35,7 @@ DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
 void QuantileHistMaker::Init(const std::vector<std::pair<std::string, std::string> >& args) {
  // initialize pruner
  if (!pruner_) {
-    pruner_.reset(TreeUpdater::Create("prune"));
+    pruner_.reset(TreeUpdater::Create("prune", tparam_));
  }
  pruner_->Init(args);
  param_.InitAllowUnknown(args);