Refactor fast-hist, add tests for some updaters. (#3836)

* Add unittest for prune.
* Add unittest for refresh.
* Refactor fast_hist.
  * Remove fast_hist_param.
  * Rename to quantile_hist.
* Add unittests for QuantileHist.
  * Refactor QuantileHist into .h and .cc file.
  * Remove sync.h.
  * Remove MGPU_mock test.
* Rename fast hist method to quantile hist.
2018-11-07 21:15:07 +13:00 · 2018-11-07 21:15:07 +13:00 · 19ee0a3579
commit 19ee0a3579
parent 2b045aa805
30 changed files with 1366 additions and 983 deletions
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@ -48,7 +48,7 @@
 #include "../src/tree/tree_model.cc"
 #include "../src/tree/tree_updater.cc"
 #include "../src/tree/updater_colmaker.cc"
-#include "../src/tree/updater_fast_hist.cc"
+#include "../src/tree/updater_quantile_hist.cc"
 #include "../src/tree/updater_prune.cc"
 #include "../src/tree/updater_refresh.cc"
 #include "../src/tree/updater_sync.cc"
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@ -19,7 +19,6 @@
 #include <cstdio>
 #include <cstring>
 #include <vector>
-#include "./common/sync.h"
 #include "./common/config.h"


--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@ -4,10 +4,11 @@
 * \brief Utilities to store histograms
 * \author Philip Cho, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <dmlc/omp.h>
 #include <numeric>
 #include <vector>
-#include "./sync.h"
+
 #include "./random.h"
 #include "./column_matrix.h"
 #include "./hist_util.h"
@ -216,7 +217,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
           const std::vector<size_t>& feature_nnz,
           const ColumnMatrix& colmat,
           size_t nrow,
-           const FastHistParam& param) {
+           const tree::TrainParam& param) {
  /* Goal: Bundle features together that has little or no "overlap", i.e.
           only a few data points should have nonzero values for
           member features.
@ -278,7 +279,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
 inline std::vector<std::vector<unsigned>>
 FastFeatureGrouping(const GHistIndexMatrix& gmat,
                    const ColumnMatrix& colmat,
-                    const FastHistParam& param) {
+                    const tree::TrainParam& param) {
  const size_t nrow = gmat.row_ptr.size() - 1;
  const size_t nfeature = gmat.cut.row_ptr.size() - 1;

@ -332,7 +333,7 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,

 void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
                                 const ColumnMatrix& colmat,
-                                 const FastHistParam& param) {
+                                 const tree::TrainParam& param) {
  cut_ = &gmat.cut;

  const size_t nrow = gmat.row_ptr.size() - 1;
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@ -11,7 +11,6 @@
 #include <limits>
 #include <vector>
 #include "row_set.h"
-#include "../tree/fast_hist_param.h"
 #include "../tree/param.h"
 #include "./quantile.h"

@ -19,8 +18,6 @@ namespace xgboost {

 namespace common {

-using tree::FastHistParam;
-
 /*! \brief sums of gradient statistics corresponding to a histogram bin */
 struct GHistEntry {
  /*! \brief sum of first-order gradient statistics */
@ -145,7 +142,7 @@ class GHistIndexBlockMatrix {
 public:
  void Init(const GHistIndexMatrix& gmat,
            const ColumnMatrix& colmat,
-            const FastHistParam& param);
+            const tree::TrainParam& param);

  inline GHistIndexBlock operator[](size_t i) const {
    return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
--- a/src/common/io.h
+++ b/src/common/io.h
@ -9,9 +9,9 @@
 #define XGBOOST_COMMON_IO_H_

 #include <dmlc/io.h>
+#include <rabit/rabit.h>
 #include <string>
 #include <cstring>
-#include "./sync.h"

 namespace xgboost {
 namespace common {
--- a/src/common/sync.h
+++ b/src/common/sync.h
@ -1,13 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file sync.h
- * \brief the synchronization module of rabit
- *        redirects to rabit header
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_COMMON_SYNC_H_
-#define XGBOOST_COMMON_SYNC_H_
-
-#include <rabit/rabit.h>
-
-#endif  // XGBOOST_COMMON_SYNC_H_
--- a/src/learner.cc
+++ b/src/learner.cc
@ -211,8 +211,8 @@ class LearnerImpl : public Learner {
      break;
     case TreeMethod::kHist:
      LOG(CONSOLE) << "Tree method is selected to be 'hist', which uses a "
-                      "single updater grow_fast_histmaker.";
-      cfg_["updater"] = "grow_fast_histmaker";
+                      "single updater grow_quantile_histmaker.";
+      cfg_["updater"] = "grow_quantile_histmaker";
      break;
     case TreeMethod::kGPUExact:
      this->AssertGPUSupport();
--- a/src/logging.cc
+++ b/src/logging.cc
@ -4,9 +4,9 @@
 * \brief Implementation of loggers.
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/logging.h>
 #include <iostream>
-#include "./common/sync.h"

 #if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
 // Override logging mechanism for non-R interfaces
--- a/src/metric/elementwise_metric.cc
+++ b/src/metric/elementwise_metric.cc
@ -4,11 +4,11 @@
 * \brief evaluation metrics for elementwise binary or regression.
 * \author Kailong Chen, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/metric.h>
 #include <dmlc/registry.h>
 #include <cmath>
 #include "../common/math.h"
-#include "../common/sync.h"

 namespace xgboost {
 namespace metric {
--- a/src/metric/multiclass_metric.cc
+++ b/src/metric/multiclass_metric.cc
@ -4,9 +4,9 @@
 * \brief evaluation metrics for multiclass classification.
 * \author Kailong Chen, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/metric.h>
 #include <cmath>
-#include "../common/sync.h"
 #include "../common/math.h"

 namespace xgboost {
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@ -4,10 +4,10 @@
 * \brief prediction rank based metrics.
 * \author Kailong Chen, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/metric.h>
 #include <dmlc/registry.h>
 #include <cmath>
-#include "../common/sync.h"
 #include "../common/math.h"

 namespace xgboost {
--- a/src/tree/fast_hist_param.h
+++ b/src/tree/fast_hist_param.h
@ -1,54 +0,0 @@
-/*!
- * Copyright 2017 by Contributors
- * \file updater_fast_hist.h
- * \brief parameters for histogram-based training
- * \author Philip Cho, Tianqi Chen
- */
-#ifndef XGBOOST_TREE_FAST_HIST_PARAM_H_
-#define XGBOOST_TREE_FAST_HIST_PARAM_H_
-
-namespace xgboost {
-namespace tree {
-
-/*! \brief training parameters for histogram-based training */
-struct FastHistParam : public dmlc::Parameter<FastHistParam> {
-  int colmat_dtype;
-  // percentage threshold for treating a feature as sparse
-  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
-  double sparse_threshold;
-  // use feature grouping? (default yes)
-  int enable_feature_grouping;
-  // when grouping features, how many "conflicts" to allow.
-  // conflict is when an instance has nonzero values for two or more features
-  // default is 0, meaning features should be strictly complementary
-  double max_conflict_rate;
-  // when grouping features, how much effort to expend to prevent singleton groups
-  // we'll try to insert each feature into existing groups before creating a new group
-  // for that feature; to save time, only up to (max_search_group) of existing groups
-  // will be considered. If set to zero, ALL existing groups will be examined
-  unsigned max_search_group;
-
-  // declare the parameters
-  DMLC_DECLARE_PARAMETER(FastHistParam) {
-    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
-        .describe("percentage threshold for treating a feature as sparse");
-    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
-        .describe("if >0, enable feature grouping to ameliorate work imbalance "
-                  "among worker threads");
-    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
-        .describe("when grouping features, how many \"conflicts\" to allow."
-       "conflict is when an instance has nonzero values for two or more features."
-       "default is 0, meaning features should be strictly complementary.");
-    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
-        .describe("when grouping features, how much effort to expend to prevent "
-                  "singleton groups. We'll try to insert each feature into existing "
-                  "groups before creating a new group for that feature; to save time, "
-                  "only up to (max_search_group) of existing groups will be "
-                  "considered. If set to zero, ALL existing groups will be examined.");
-  }
-};
-
-}  // namespace tree
-}  // namespace xgboost
-
-#endif  // XGBOOST_TREE_FAST_HIST_PARAM_H_
--- a/src/tree/param.h
+++ b/src/tree/param.h
@ -81,6 +81,23 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
  int gpu_batch_nrows;
  // the criteria to use for ranking splits
  std::string split_evaluator;
+
+  // ------ From cpu quantile histogram -------.
+  // percentage threshold for treating a feature as sparse
+  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
+  double sparse_threshold;
+  // use feature grouping? (default yes)
+  int enable_feature_grouping;
+  // when grouping features, how many "conflicts" to allow.
+  // conflict is when an instance has nonzero values for two or more features
+  // default is 0, meaning features should be strictly complementary
+  double max_conflict_rate;
+  // when grouping features, how much effort to expend to prevent singleton groups
+  // we'll try to insert each feature into existing groups before creating a new group
+  // for that feature; to save time, only up to (max_search_group) of existing groups
+  // will be considered. If set to zero, ALL existing groups will be examined
+  unsigned max_search_group;
+
  // declare the parameters
  DMLC_DECLARE_PARAMETER(TrainParam) {
    DMLC_DECLARE_FIELD(learning_rate)
@ -196,6 +213,24 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
    DMLC_DECLARE_FIELD(split_evaluator)
        .set_default("elastic_net,monotonic,interaction")
        .describe("The criteria to use for ranking splits");
+
+    // ------ From cpu quantile histogram -------.
+    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
+        .describe("percentage threshold for treating a feature as sparse");
+    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
+        .describe("if >0, enable feature grouping to ameliorate work imbalance "
+                  "among worker threads");
+    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
+        .describe("when grouping features, how many \"conflicts\" to allow."
+       "conflict is when an instance has nonzero values for two or more features."
+       "default is 0, meaning features should be strictly complementary.");
+    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
+        .describe("when grouping features, how much effort to expend to prevent "
+                  "singleton groups. We'll try to insert each feature into existing "
+                  "groups before creating a new group for that feature; to save time, "
+                  "only up to (max_search_group) of existing groups will be "
+                  "considered. If set to zero, ALL existing groups will be examined.");
+
    // add alias of parameters
    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
--- a/src/tree/tree_updater.cc
+++ b/src/tree/tree_updater.cc
@ -31,7 +31,7 @@ DMLC_REGISTRY_LINK_TAG(updater_colmaker);
 DMLC_REGISTRY_LINK_TAG(updater_skmaker);
 DMLC_REGISTRY_LINK_TAG(updater_refresh);
 DMLC_REGISTRY_LINK_TAG(updater_prune);
-DMLC_REGISTRY_LINK_TAG(updater_fast_hist);
+DMLC_REGISTRY_LINK_TAG(updater_quantile_hist);
 DMLC_REGISTRY_LINK_TAG(updater_histmaker);
 DMLC_REGISTRY_LINK_TAG(updater_sync);
 #ifdef XGBOOST_USE_CUDA
--- a/src/tree/updater_basemaker-inl.h
+++ b/src/tree/updater_basemaker-inl.h
@ -7,6 +7,8 @@
 #ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_
 #define XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_

+#include <rabit/rabit.h>
+
 #include <xgboost/base.h>
 #include <xgboost/tree_updater.h>
 #include <vector>
@ -14,8 +16,8 @@
 #include <string>
 #include <limits>
 #include <utility>
+
 #include "./param.h"
-#include "../common/sync.h"
 #include "../common/io.h"
 #include "../common/random.h"
 #include "../common/quantile.h"
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@ -4,15 +4,16 @@
 * \brief use columnwise update to construct a tree
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>
 #include <memory>
 #include <vector>
 #include <cmath>
 #include <algorithm>
+
 #include "./param.h"
 #include "../common/random.h"
 #include "../common/bitmap.h"
-#include "../common/sync.h"
 #include "split_evaluator.h"

 namespace xgboost {
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@ -1,873 +0,0 @@
-/*!
- * Copyright 2017 by Contributors
- * \file updater_fast_hist.cc
- * \brief use quantized feature values to construct a tree
- * \author Philip Cho, Tianqi Checn
- */
-#include <dmlc/timer.h>
-#include <xgboost/tree_updater.h>
-#include <cmath>
-#include <memory>
-#include <vector>
-#include <algorithm>
-#include <queue>
-#include <iomanip>
-#include <numeric>
-#include "./param.h"
-#include "./fast_hist_param.h"
-#include "./split_evaluator.h"
-#include "../common/random.h"
-#include "../common/bitmap.h"
-#include "../common/sync.h"
-#include "../common/hist_util.h"
-#include "../common/row_set.h"
-#include "../common/column_matrix.h"
-
-namespace xgboost {
-namespace tree {
-
-using xgboost::common::HistCutMatrix;
-using xgboost::common::GHistIndexMatrix;
-using xgboost::common::GHistIndexBlockMatrix;
-using xgboost::common::GHistIndexRow;
-using xgboost::common::GHistEntry;
-using xgboost::common::HistCollection;
-using xgboost::common::RowSetCollection;
-using xgboost::common::GHistRow;
-using xgboost::common::GHistBuilder;
-using xgboost::common::ColumnMatrix;
-using xgboost::common::Column;
-
-DMLC_REGISTRY_FILE_TAG(updater_fast_hist);
-
-DMLC_REGISTER_PARAMETER(FastHistParam);
-
-/*! \brief construct a tree using quantized feature values */
-class FastHistMaker: public TreeUpdater {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
-    // initialize pruner
-    if (!pruner_) {
-      pruner_.reset(TreeUpdater::Create("prune"));
-    }
-    pruner_->Init(args);
-    param_.InitAllowUnknown(args);
-    fhparam_.InitAllowUnknown(args);
-    is_gmat_initialized_ = false;
-
-    // initialise the split evaluator
-    if (!spliteval_) {
-      spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
-    }
-
-    spliteval_->Init(args);
-  }
-
-  void Update(HostDeviceVector<GradientPair>* gpair,
-              DMatrix* dmat,
-              const std::vector<RegTree*>& trees) override {
-    GradStats::CheckInfo(dmat->Info());
-    if (is_gmat_initialized_ == false) {
-      double tstart = dmlc::GetTime();
-      gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
-      column_matrix_.Init(gmat_, fhparam_.sparse_threshold);
-      if (fhparam_.enable_feature_grouping > 0) {
-        gmatb_.Init(gmat_, column_matrix_, fhparam_);
-      }
-      is_gmat_initialized_ = true;
-      if (param_.debug_verbose > 0) {
-        LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
-      }
-    }
-    // rescale learning rate according to size of trees
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
-    // build tree
-    if (!builder_) {
-      builder_.reset(new Builder(
-        param_,
-        fhparam_,
-        std::move(pruner_),
-        std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
-    }
-    for (auto tree : trees) {
-      builder_->Update
-        (gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
-    }
-    param_.learning_rate = lr;
-  }
-
-  bool UpdatePredictionCache(const DMatrix* data,
-                             HostDeviceVector<bst_float>* out_preds) override {
-    if (!builder_ || param_.subsample < 1.0f) {
-      return false;
-    } else {
-      return builder_->UpdatePredictionCache(data, out_preds);
-    }
-  }
-
- protected:
-  // training parameter
-  TrainParam param_;
-  FastHistParam fhparam_;
-  // quantized data matrix
-  GHistIndexMatrix gmat_;
-  // (optional) data matrix with feature grouping
-  GHistIndexBlockMatrix gmatb_;
-  // column accessor
-  ColumnMatrix column_matrix_;
-  bool is_gmat_initialized_;
-
-  // data structure
-  struct NodeEntry {
-    /*! \brief statics for node entry */
-    GradStats stats;
-    /*! \brief loss of this node, without split */
-    bst_float root_gain;
-    /*! \brief weight calculated related to current data */
-    float weight;
-    /*! \brief current best solution */
-    SplitEntry best;
-    // constructor
-    explicit NodeEntry(const TrainParam& param)
-        : stats(param), root_gain(0.0f), weight(0.0f) {
-    }
-  };
-  // actual builder that runs the algorithm
-
-  struct Builder {
-   public:
-    // constructor
-    explicit Builder(const TrainParam& param,
-                     const FastHistParam& fhparam,
-                     std::unique_ptr<TreeUpdater> pruner,
-                     std::unique_ptr<SplitEvaluator> spliteval)
-      : param_(param), fhparam_(fhparam), pruner_(std::move(pruner)),
-        spliteval_(std::move(spliteval)), p_last_tree_(nullptr),
-        p_last_fmat_(nullptr) {}
-    // update one tree, growing
-    virtual void Update(const GHistIndexMatrix& gmat,
-                        const GHistIndexBlockMatrix& gmatb,
-                        const ColumnMatrix& column_matrix,
-                        HostDeviceVector<GradientPair>* gpair,
-                        DMatrix* p_fmat,
-                        RegTree* p_tree) {
-      double gstart = dmlc::GetTime();
-
-      int num_leaves = 0;
-      unsigned timestamp = 0;
-
-      double tstart;
-      double time_init_data = 0;
-      double time_init_new_node = 0;
-      double time_build_hist = 0;
-      double time_evaluate_split = 0;
-      double time_apply_split = 0;
-
-      const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
-
-      spliteval_->Reset();
-
-      tstart = dmlc::GetTime();
-      this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
-      time_init_data = dmlc::GetTime() - tstart;
-
-      // FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
-      CHECK_EQ(p_tree->param.num_roots, 1)
-        << "tree_method=hist does not support multiple roots at this moment";
-      for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
-        tstart = dmlc::GetTime();
-        hist_.AddHistRow(nid);
-        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid]);
-        time_build_hist += dmlc::GetTime() - tstart;
-
-        tstart = dmlc::GetTime();
-        this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
-        time_init_new_node += dmlc::GetTime() - tstart;
-
-        tstart = dmlc::GetTime();
-        this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
-        time_evaluate_split += dmlc::GetTime() - tstart;
-        qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
-                                   snode_[nid].best.loss_chg,
-                                   timestamp++));
-        ++num_leaves;
-      }
-
-      while (!qexpand_->empty()) {
-        const ExpandEntry candidate = qexpand_->top();
-        const int nid = candidate.nid;
-        qexpand_->pop();
-        if (candidate.loss_chg <= kRtEps
-            || (param_.max_depth > 0 && candidate.depth == param_.max_depth)
-            || (param_.max_leaves > 0 && num_leaves == param_.max_leaves) ) {
-          (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
-        } else {
-          tstart = dmlc::GetTime();
-          this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
-          time_apply_split += dmlc::GetTime() - tstart;
-
-          tstart = dmlc::GetTime();
-          const int cleft = (*p_tree)[nid].LeftChild();
-          const int cright = (*p_tree)[nid].RightChild();
-          hist_.AddHistRow(cleft);
-          hist_.AddHistRow(cright);
-          if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
-            BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
-            SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
-          } else {
-            BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
-            SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
-          }
-          time_build_hist += dmlc::GetTime() - tstart;
-
-          tstart = dmlc::GetTime();
-          this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
-          this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
-          bst_uint featureid = snode_[nid].best.SplitIndex();
-          spliteval_->AddSplit(nid, cleft, cright, featureid,
-              snode_[cleft].weight, snode_[cright].weight);
-          time_init_new_node += dmlc::GetTime() - tstart;
-
-          tstart = dmlc::GetTime();
-          this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree);
-          this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree);
-          time_evaluate_split += dmlc::GetTime() - tstart;
-
-          qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
-                                     snode_[cleft].best.loss_chg,
-                                     timestamp++));
-          qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
-                                     snode_[cright].best.loss_chg,
-                                     timestamp++));
-
-          ++num_leaves;  // give two and take one, as parent is no longer a leaf
-        }
-      }
-
-      // set all the rest expanding nodes to leaf
-      // This post condition is not needed in current code, but may be necessary
-      // when there are stopping rule that leaves qexpand non-empty
-      while (!qexpand_->empty()) {
-        const int nid = qexpand_->top().nid;
-        qexpand_->pop();
-        (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
-      }
-      // remember auxiliary statistics in the tree node
-      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
-        p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
-        p_tree->Stat(nid).base_weight = snode_[nid].weight;
-        p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
-        snode_[nid].stats.SetLeafVec(param_, p_tree->Leafvec(nid));
-      }
-
-      pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});
-
-      if (param_.debug_verbose > 0) {
-        double total_time = dmlc::GetTime() - gstart;
-        LOG(INFO) << "\nInitData:          "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_init_data / total_time * 100 << "%)\n"
-                  << "InitNewNode:       "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_init_new_node / total_time * 100 << "%)\n"
-                  << "BuildHist:         "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_build_hist / total_time * 100 << "%)\n"
-                  << "EvaluateSplit:     "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_evaluate_split / total_time * 100 << "%)\n"
-                  << "ApplySplit:        "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_apply_split / total_time * 100 << "%)\n"
-                  << "========================================\n"
-                  << "Total:             "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << total_time;
-      }
-    }
-
-    inline void BuildHist(const std::vector<GradientPair>& gpair,
-                          const RowSetCollection::Elem row_indices,
-                          const GHistIndexMatrix& gmat,
-                          const GHistIndexBlockMatrix& gmatb,
-                          GHistRow hist) {
-      if (fhparam_.enable_feature_grouping > 0) {
-        hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist);
-      } else {
-        hist_builder_.BuildHist(gpair, row_indices, gmat, hist);
-      }
-    }
-
-    inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
-      hist_builder_.SubtractionTrick(self, sibling, parent);
-    }
-
-    inline bool UpdatePredictionCache(const DMatrix* data,
-                                      HostDeviceVector<bst_float>* p_out_preds) {
-      std::vector<bst_float>& out_preds = p_out_preds->HostVector();
-
-      // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
-      // conjunction with Update().
-      if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
-        return false;
-      }
-
-      if (leaf_value_cache_.empty()) {
-        leaf_value_cache_.resize(p_last_tree_->param.num_nodes,
-          std::numeric_limits<float>::infinity());
-      }
-
-      CHECK_GT(out_preds.size(), 0U);
-
-      for (const RowSetCollection::Elem rowset : row_set_collection_) {
-        if (rowset.begin != nullptr && rowset.end != nullptr) {
-          int nid = rowset.node_id;
-          bst_float leaf_value;
-          // if a node is marked as deleted by the pruner, traverse upward to locate
-          // a non-deleted leaf.
-          if ((*p_last_tree_)[nid].IsDeleted()) {
-            while ((*p_last_tree_)[nid].IsDeleted()) {
-              nid = (*p_last_tree_)[nid].Parent();
-            }
-            CHECK((*p_last_tree_)[nid].IsLeaf());
-          }
-          leaf_value = (*p_last_tree_)[nid].LeafValue();
-
-          for (const size_t* it = rowset.begin; it < rowset.end; ++it) {
-            out_preds[*it] += leaf_value;
-          }
-        }
-      }
-
-      return true;
-    }
-
-   protected:
-    // initialize temp data structure
-    inline void InitData(const GHistIndexMatrix& gmat,
-                         const std::vector<GradientPair>& gpair,
-                         const DMatrix& fmat,
-                         const RegTree& tree) {
-      CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
-          << "ColMakerHist: can only grow new tree";
-      CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
-          << "max_depth or max_leaves cannot be both 0 (unlimited); "
-          << "at least one should be a positive quantity.";
-      if (param_.grow_policy == TrainParam::kDepthWise) {
-        CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
-          << "when grow_policy is depthwise.";
-      }
-      const auto& info = fmat.Info();
-
-      {
-        // initialize the row set
-        row_set_collection_.Clear();
-        // clear local prediction cache
-        leaf_value_cache_.clear();
-        // initialize histogram collection
-        uint32_t nbins = gmat.cut.row_ptr.back();
-        hist_.Init(nbins);
-
-        // initialize histogram builder
-        #pragma omp parallel
-        {
-          this->nthread_ = omp_get_num_threads();
-        }
-        hist_builder_.Init(this->nthread_, nbins);
-
-        CHECK_EQ(info.root_index_.size(), 0U);
-        std::vector<size_t>& row_indices = row_set_collection_.row_indices_;
-        // mark subsample and build list of member rows
-        if (param_.subsample < 1.0f) {
-          std::bernoulli_distribution coin_flip(param_.subsample);
-          auto& rnd = common::GlobalRandom();
-          for (size_t i = 0; i < info.num_row_; ++i) {
-            if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
-              row_indices.push_back(i);
-            }
-          }
-        } else {
-          for (size_t i = 0; i < info.num_row_; ++i) {
-            if (gpair[i].GetHess() >= 0.0f) {
-              row_indices.push_back(i);
-            }
-          }
-        }
-        row_set_collection_.Init();
-      }
-
-      {
-        /* determine layout of data */
-        const size_t nrow = info.num_row_;
-        const size_t ncol = info.num_col_;
-        const size_t nnz = info.num_nonzero_;
-        // number of discrete bins for feature 0
-        const uint32_t nbins_f0 = gmat.cut.row_ptr[1] - gmat.cut.row_ptr[0];
-        if (nrow * ncol == nnz) {
-          // dense data with zero-based indexing
-          data_layout_ = kDenseDataZeroBased;
-        } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
-          // dense data with one-based indexing
-          data_layout_ = kDenseDataOneBased;
-        } else {
-          // sparse data
-          data_layout_ = kSparseData;
-        }
-      }
-      {
-        // store a pointer to the tree
-        p_last_tree_ = &tree;
-        // store a pointer to training data
-        p_last_fmat_ = &fmat;
-        // initialize feature index
-        if (data_layout_ == kDenseDataOneBased) {
-          column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
-                               param_.colsample_bytree, true);
-        } else {
-          column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
-                               param_.colsample_bytree, false);
-        }
-      }
-      if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
-        /* specialized code for dense data:
-           choose the column that has a least positive number of discrete bins.
-           For dense data (with no missing value),
-              the sum of gradient histogram is equal to snode[nid] */
-        const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
-        const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
-        uint32_t min_nbins_per_feature = 0;
-        for (bst_uint i = 0; i < nfeature; ++i) {
-          const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
-          if (nbins > 0) {
-            if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
-              min_nbins_per_feature = nbins;
-              fid_least_bins_ = i;
-            }
-          }
-        }
-        CHECK_GT(min_nbins_per_feature, 0U);
-      }
-      {
-        snode_.reserve(256);
-        snode_.clear();
-      }
-      {
-        if (param_.grow_policy == TrainParam::kLossGuide) {
-          qexpand_.reset(new ExpandQueue(LossGuide));
-        } else {
-          qexpand_.reset(new ExpandQueue(DepthWise));
-        }
-      }
-    }
-
-    inline void EvaluateSplit(int nid,  // picks the best split for node nid; the result is left in snode_[nid].best
-                              const GHistIndexMatrix& gmat,
-                              const HistCollection& hist,
-                              const DMatrix& fmat,
-                              const RegTree& tree) {
-      // start enumeration over the features returned by the column sampler for this node's depth
-      const MetaInfo& info = fmat.Info();
-      const auto& feature_set = column_sampler_.GetFeatureSet(tree.GetDepth(nid)).HostVector();
-      const auto nfeature = static_cast<bst_uint>(feature_set.size());
-      const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
-      best_split_tloc_.resize(nthread);  // one thread-local best-split slot per thread
-      #pragma omp parallel for schedule(static) num_threads(nthread)
-      for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
-        best_split_tloc_[tid] = snode_[nid].best;  // seed every slot with the node's current best
-      }
-      #pragma omp parallel for schedule(dynamic) num_threads(nthread)
-      for (bst_omp_uint i = 0; i < nfeature; ++i) {
-        const bst_uint fid = feature_set[i];
-        const unsigned tid = omp_get_thread_num();  // each thread only writes its own slot
-        this->EnumerateSplit(-1, gmat, hist[nid], snode_[nid], info,  // backward scan (d_step = -1)
-          &best_split_tloc_[tid], fid, nid);
-        this->EnumerateSplit(+1, gmat, hist[nid], snode_[nid], info,  // forward scan (d_step = +1)
-          &best_split_tloc_[tid], fid, nid);
-      }
-      for (unsigned tid = 0; tid < nthread; ++tid) {
-        snode_[nid].best.Update(best_split_tloc_[tid]);  // serial merge of the per-thread results
-      }
-    }
-
-    inline void ApplySplit(int nid,  // turns node nid into a split node and partitions its rows between the two new children
-                           const GHistIndexMatrix& gmat,
-                           const ColumnMatrix& column_matrix,
-                           const HistCollection& hist,
-                           const DMatrix& fmat,
-                           RegTree* p_tree) {
-      // TODO(hcho3): support feature sampling by levels
-
-      /* 1. Create child nodes, using the best split recorded in snode_[nid] */
-      NodeEntry& e = snode_[nid];
-
-      p_tree->AddChilds(nid);
-      (*p_tree)[nid].SetSplit(e.best.SplitIndex(), e.best.split_value, e.best.DefaultLeft());
-      // mark right child as 0, to indicate fresh leaf
-      int cleft = (*p_tree)[nid].LeftChild();
-      int cright = (*p_tree)[nid].RightChild();
-      (*p_tree)[cleft].SetLeaf(0.0f, 0);
-      (*p_tree)[cright].SetLeaf(0.0f, 0);
-
-      /* 2. Categorize member rows into per-thread left/right lists */
-      const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
-      row_split_tloc_.resize(nthread);
-      for (bst_omp_uint i = 0; i < nthread; ++i) {
-        row_split_tloc_[i].left.clear();
-        row_split_tloc_[i].right.clear();
-      }
-      const bool default_left = (*p_tree)[nid].DefaultLeft();
-      const bst_uint fid = (*p_tree)[nid].SplitIndex();
-      const bst_float split_pt = (*p_tree)[nid].SplitCond();
-      const uint32_t lower_bound = gmat.cut.row_ptr[fid];      // first bin index of feature fid
-      const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1];  // one past the last bin index of feature fid
-      int32_t split_cond = -1;
-      // convert floating-point split_pt into corresponding bin_id
-      // split_cond = -1 indicates that split_pt is less than all known cut points
-      CHECK_LT(upper_bound,
-        static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));  // bin ids must fit in int32_t for the cast below
-      for (uint32_t i = lower_bound; i < upper_bound; ++i) {
-        if (split_pt == gmat.cut.cut[i]) {  // exact floating-point match against the stored cut value
-          split_cond = static_cast<int32_t>(i);
-        }
-      }
-
-      const auto& rowset = row_set_collection_[nid];
-
-      Column column = column_matrix.GetColumn(fid);
-      if (column.GetType() == xgboost::common::kDenseColumn) {  // dispatch on the storage type of the split column
-        ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond,
-          default_left);
-      } else {
-        ApplySplitSparseData(rowset, gmat, &row_split_tloc_, column, lower_bound,
-          upper_bound, split_cond, default_left);
-      }
-
-      row_set_collection_.AddSplit(  // hand the per-thread row lists to the row-set collection under the two child ids
-        nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild());
-    }
-
-    inline void ApplySplitDenseData(const RowSetCollection::Elem rowset,  // assigns each row of rowset to a per-thread left or right list, reading bins from a dense column
-                                    const GHistIndexMatrix& gmat,
-                                    std::vector<RowSetCollection::Split>* p_row_split_tloc,
-                                    const Column& column,
-                                    bst_int split_cond,
-                                    bool default_left) {
-      std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
-      constexpr int kUnroll = 8;  // loop unrolling factor
-      const size_t nrows = rowset.end - rowset.begin;
-      const size_t rest = nrows % kUnroll;  // rows left over after the full chunks of kUnroll
-
-      #pragma omp parallel for num_threads(nthread_) schedule(static)
-      for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
-        const bst_uint tid = omp_get_thread_num();
-        auto& left = row_split_tloc[tid].left;    // each thread appends only to its own lists
-        auto& right = row_split_tloc[tid].right;
-        size_t rid[kUnroll];
-        uint32_t rbin[kUnroll];
-        for (int k = 0; k < kUnroll; ++k) {
-          rid[k] = rowset.begin[i + k];  // gather the row ids of this chunk
-        }
-        for (int k = 0; k < kUnroll; ++k) {
-          rbin[k] = column.GetFeatureBinIdx(rid[k]);  // look up each row's bin in the column
-        }
-        for (int k = 0; k < kUnroll; ++k) {                      // NOLINT
-          if (rbin[k] == std::numeric_limits<uint32_t>::max()) {  // missing value
-            if (default_left) {
-              left.push_back(rid[k]);
-            } else {
-              right.push_back(rid[k]);
-            }
-          } else {
-            if (static_cast<int32_t>(rbin[k] + column.GetBaseIdx()) <= split_cond) {  // bin index plus the column's base index, compared with the split bin
-              left.push_back(rid[k]);
-            } else {
-              right.push_back(rid[k]);
-            }
-          }
-        }
-      }
-      for (size_t i = nrows - rest; i < nrows; ++i) {  // leftover rows: handled serially, appended to the last thread's lists
-        auto& left = row_split_tloc[nthread_-1].left;
-        auto& right = row_split_tloc[nthread_-1].right;
-        const size_t rid = rowset.begin[i];
-        const uint32_t rbin = column.GetFeatureBinIdx(rid);
-        if (rbin == std::numeric_limits<uint32_t>::max()) {  // missing value
-          if (default_left) {
-            left.push_back(rid);
-          } else {
-            right.push_back(rid);
-          }
-        } else {
-          if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
-            left.push_back(rid);
-          } else {
-            right.push_back(rid);
-          }
-        }
-      }
-    }
-
-    inline void ApplySplitSparseData(const RowSetCollection::Elem rowset,  // assigns each row of rowset to a per-thread left or right list, reading bins from a sparse column
-                                    const GHistIndexMatrix& gmat,
-                                    std::vector<RowSetCollection::Split>* p_row_split_tloc,
-                                    const Column& column,
-                                    bst_uint lower_bound,
-                                    bst_uint upper_bound,
-                                    bst_int split_cond,
-                                    bool default_left) {
-      std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
-      const size_t nrows = rowset.end - rowset.begin;
-
-      #pragma omp parallel num_threads(nthread_)
-      {
-        const auto tid = static_cast<size_t>(omp_get_thread_num());
-        const size_t ibegin = tid * nrows / nthread_;      // thread tid handles the slice [ibegin, iend) of rowset
-        const size_t iend = (tid + 1) * nrows / nthread_;
-        if (ibegin < iend) {  // ensure that [ibegin, iend) is nonempty range
-          // search first nonzero row with index >= rowset[ibegin]
-          const size_t* p = std::lower_bound(column.GetRowData(),
-                                             column.GetRowData() + column.Size(),
-                                             rowset.begin[ibegin]);
-
-          auto& left = row_split_tloc[tid].left;
-          auto& right = row_split_tloc[tid].right;
-          if (p != column.GetRowData() + column.Size() && *p <= rowset.begin[iend - 1]) {  // the column stores an entry between the slice's first and last row ids
-            size_t cursor = p - column.GetRowData();  // position inside the column's stored entries
-
-            for (size_t i = ibegin; i < iend; ++i) {
-              const size_t rid = rowset.begin[i];
-              while (cursor < column.Size()  // advance the cursor to the first stored entry whose row index is >= rid
-                     && column.GetRowIdx(cursor) < rid
-                     && column.GetRowIdx(cursor) <= rowset.begin[iend - 1]) {
-                ++cursor;
-              }
-              if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) {  // row rid has a stored value for this feature
-                const uint32_t rbin = column.GetFeatureBinIdx(cursor);
-                if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
-                  left.push_back(rid);
-                } else {
-                  right.push_back(rid);
-                }
-                ++cursor;
-              } else {
-                // missing value
-                if (default_left) {
-                  left.push_back(rid);
-                } else {
-                  right.push_back(rid);
-                }
-              }
-            }
-          } else {  // all rows in [ibegin, iend) have missing values
-            if (default_left) {
-              for (size_t i = ibegin; i < iend; ++i) {
-                const size_t rid = rowset.begin[i];
-                left.push_back(rid);
-              }
-            } else {
-              for (size_t i = ibegin; i < iend; ++i) {
-                const size_t rid = rowset.begin[i];
-                right.push_back(rid);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    inline void InitNewNode(int nid,  // fills snode_[nid] with gradient statistics, weight and root gain
-                            const GHistIndexMatrix& gmat,
-                            const std::vector<GradientPair>& gpair,
-                            const DMatrix& fmat,
-                            const RegTree& tree) {
-      {
-        snode_.resize(tree.param.num_nodes, NodeEntry(param_));  // make sure snode_ has an entry for every node in the tree
-      }
-
-      {
-        auto& stats = snode_[nid].stats;
-        if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
-          /* specialized code for dense data
-             For dense data (with no missing value),
-                the sum of gradient histogram is equal to snode[nid] */
-          GHistRow hist = hist_[nid];
-          const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
-
-          const uint32_t ibegin = row_ptr[fid_least_bins_];  // sum over the bins of the feature with the fewest bins
-          const uint32_t iend = row_ptr[fid_least_bins_ + 1];
-          for (uint32_t i = ibegin; i < iend; ++i) {
-            const GHistEntry et = hist.begin[i];
-            stats.Add(et.sum_grad, et.sum_hess);
-          }
-        } else {
-          const RowSetCollection::Elem e = row_set_collection_[nid];
-          for (const size_t* it = e.begin; it < e.end; ++it) {  // otherwise accumulate the gradient pairs of the node's rows
-            stats.Add(gpair[*it]);
-          }
-        }
-      }
-
-      // calculating the weights (both the weight and the root gain come from the split evaluator)
-      {
-        bst_uint parentid = tree[nid].Parent();
-        snode_[nid].weight = static_cast<float>(
-            spliteval_->ComputeWeight(parentid, snode_[nid].stats));
-        snode_[nid].root_gain = static_cast<float>(
-            spliteval_->ComputeScore(parentid, snode_[nid].stats, snode_[nid].weight));
-      }
-    }
-
-    // enumerate the split values of specific feature, scanning its bins in direction d_step, and fold the best candidate into *p_best
-    inline void EnumerateSplit(int d_step,
-                               const GHistIndexMatrix& gmat,
-                               const GHistRow& hist,
-                               const NodeEntry& snode,
-                               const MetaInfo& info,
-                               SplitEntry* p_best,
-                               bst_uint fid,
-                               bst_uint nodeID) {
-      CHECK(d_step == +1 || d_step == -1);
-
-      // aliases
-      const std::vector<uint32_t>& cut_ptr = gmat.cut.row_ptr;
-      const std::vector<bst_float>& cut_val = gmat.cut.cut;
-
-      // statistics on both sides of split
-      GradStats c(param_);  // complement: snode.stats minus e
-      GradStats e(param_);  // running sum over the bins scanned so far
-      // best split so far
-      SplitEntry best;
-
-      // bin boundaries
-      CHECK_LE(cut_ptr[fid],
-        static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
-      CHECK_LE(cut_ptr[fid + 1],
-        static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
-      // imin: index (offset) of the minimum value for feature fid
-      //       need this for backward enumeration
-      const auto imin = static_cast<int32_t>(cut_ptr[fid]);
-      // ibegin, iend: smallest/largest cut points for feature fid
-      // use int to allow for value -1
-      int32_t ibegin, iend;
-      if (d_step > 0) {
-        ibegin = static_cast<int32_t>(cut_ptr[fid]);
-        iend = static_cast<int32_t>(cut_ptr[fid + 1]);
-      } else {
-        ibegin = static_cast<int32_t>(cut_ptr[fid + 1]) - 1;
-        iend = static_cast<int32_t>(cut_ptr[fid]) - 1;
-      }
-
-      for (int32_t i = ibegin; i != iend; i += d_step) {
-        // start working
-        // try to find a split
-        e.Add(hist.begin[i].sum_grad, hist.begin[i].sum_hess);
-        if (e.sum_hess >= param_.min_child_weight) {  // both sides must reach min_child_weight before a split is considered
-          c.SetSubstract(snode.stats, e);
-          if (c.sum_hess >= param_.min_child_weight) {
-            bst_float loss_chg;
-            bst_float split_pt;
-            if (d_step > 0) {
-              // forward enumeration: split at right bound of each bin
-              loss_chg = static_cast<bst_float>(
-                  spliteval_->ComputeSplitScore(nodeID, fid, e, c) -
-                  snode.root_gain);
-              split_pt = cut_val[i];
-            } else {
-              // backward enumeration: split at left bound of each bin
-              loss_chg = static_cast<bst_float>(
-                  spliteval_->ComputeSplitScore(nodeID, fid, c, e) -
-                  snode.root_gain);
-              if (i == imin) {
-                // for leftmost bin, left bound is the smallest feature value
-                split_pt = gmat.cut.min_val[fid];
-              } else {
-                split_pt = cut_val[i - 1];
-              }
-            }
-            best.Update(loss_chg, fid, split_pt, d_step == -1);  // last argument is true for the backward scan; presumably the default-left flag -- confirm against SplitEntry::Update
-          }
-        }
-      }
-      p_best->Update(best);
-    }
-
-    /* tree growing policies: ExpandEntry is the queue element, DepthWise/LossGuide are the queue comparators */
-    struct ExpandEntry {
-      int nid;             // node id of the expansion candidate
-      int depth;           // depth value compared by DepthWise
-      bst_float loss_chg;  // loss change compared by LossGuide
-      unsigned timestamp;  // tie-breaker used by both comparators
-      ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
-        : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
-    };
-    inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {  // comparator for ExpandQueue (a std::priority_queue): true means lhs has lower priority than rhs
-      if (lhs.depth == rhs.depth) {
-        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-      } else {
-        return lhs.depth > rhs.depth;  // favor small depth
-      }
-    }
-    inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {  // comparator for ExpandQueue (a std::priority_queue): true means lhs has lower priority than rhs
-      if (lhs.loss_chg == rhs.loss_chg) {
-        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-      } else {
-        return lhs.loss_chg < rhs.loss_chg;  // favor large loss_chg
-      }
-    }
-
-    //  --data fields--
-    const TrainParam& param_;
-    const FastHistParam& fhparam_;
-    // number of omp thread used during training
-    int nthread_;
-    common::ColumnSampler column_sampler_;
-    // the internal row sets
-    RowSetCollection row_set_collection_;
-    // the temp space for split (per-thread left/right row lists and per-thread best splits)
-    std::vector<RowSetCollection::Split> row_split_tloc_;
-    std::vector<SplitEntry> best_split_tloc_;
-    /*! \brief TreeNode Data: statistics for each constructed node */
-    std::vector<NodeEntry> snode_;
-    /*! \brief cumulative histogram of gradients. */
-    HistCollection hist_;
-    /*! \brief feature with least # of bins. to be used for dense specialization
-               of InitNewNode() */
-    uint32_t fid_least_bins_;
-    /*! \brief local prediction cache; maps node id to leaf value */
-    std::vector<float> leaf_value_cache_;
-
-    GHistBuilder hist_builder_;
-    std::unique_ptr<TreeUpdater> pruner_;
-    std::unique_ptr<SplitEvaluator> spliteval_;
-
-    // back pointers to tree and data matrix
-    const RegTree* p_last_tree_;
-    const DMatrix* p_last_fmat_;
-
-    using ExpandQueue =
-        std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
-                            std::function<bool(ExpandEntry, ExpandEntry)>>;
-    std::unique_ptr<ExpandQueue> qexpand_;  // created in InitData with LossGuide or DepthWise, depending on param_.grow_policy
-
-    enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
-    DataLayout data_layout_;
-  };
-
-  std::unique_ptr<Builder> builder_;
-  std::unique_ptr<TreeUpdater> pruner_;
-  std::unique_ptr<SplitEvaluator> spliteval_;
-};
-
-XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")  // registers the updater under the name "grow_fast_histmaker"
-.describe("Grow tree using quantized histogram.")
-.set_body([]() {
-    return new FastHistMaker();  // factory body: hands back a freshly allocated updater
-  });
-
-}  // namespace tree
-}  // namespace xgboost
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@ -4,11 +4,12 @@
 * \brief use histogram counting to construct a tree
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/base.h>
 #include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
-#include "../common/sync.h"
+
 #include "../common/quantile.h"
 #include "../common/group_data.h"
 #include "./updater_basemaker-inl.h"
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@ -4,12 +4,13 @@
 * \brief prune a tree given the statistics
 * \author Tianqi Chen
 */
-
+#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>
+
 #include <string>
 #include <memory>
+
 #include "./param.h"
-#include "../common/sync.h"
 #include "../common/io.h"

 namespace xgboost {
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@ -0,0 +1,748 @@
+/*!
+ * Copyright 2017-2018 by Contributors
+ * \file updater_quantile_hist.cc
+ * \brief use quantized feature values to construct a tree
+ * \author Philip Cho, Tianqi Chen
+ */
+#include <dmlc/timer.h>
+#include <rabit/rabit.h>
+#include <xgboost/tree_updater.h>
+
+#include <cmath>
+#include <memory>
+#include <vector>
+#include <algorithm>
+#include <queue>
+#include <iomanip>
+#include <numeric>
+#include <string>
+#include <utility>
+
+#include "./param.h"
+#include "./updater_quantile_hist.h"
+#include "./split_evaluator.h"
+#include "../common/random.h"
+#include "../common/hist_util.h"
+#include "../common/row_set.h"
+#include "../common/column_matrix.h"
+
+namespace xgboost {
+namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
+
+void QuantileHistMaker::Init(const std::vector<std::pair<std::string, std::string> >& args) {  // configures the pruner, the training parameters and the split evaluator from args
+  // initialize pruner (created on first use, then re-initialised on every call)
+  if (!pruner_) {
+    pruner_.reset(TreeUpdater::Create("prune"));
+  }
+  pruner_->Init(args);
+  param_.InitAllowUnknown(args);
+  is_gmat_initialized_ = false;  // makes the next Update() rebuild gmat_ / column_matrix_
+
+  // initialise the split evaluator (type chosen by param_.split_evaluator)
+  if (!spliteval_) {
+    spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
+  }
+
+  spliteval_->Init(args);
+}
+
+void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,  // grows every tree in `trees` from dmat and gpair through the Builder
+                           DMatrix *dmat,
+                           const std::vector<RegTree *> &trees) {
+  GradStats::CheckInfo(dmat->Info());
+  if (is_gmat_initialized_ == false) {  // build the quantized matrices once; Init() resets this flag
+    double tstart = dmlc::GetTime();
+    gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
+    column_matrix_.Init(gmat_, param_.sparse_threshold);
+    if (param_.enable_feature_grouping > 0) {
+      gmatb_.Init(gmat_, column_matrix_, param_);
+    }
+    is_gmat_initialized_ = true;
+    if (param_.debug_verbose > 0) {
+      LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
+    }
+  }
+  // rescale learning rate according to size of trees
+  float lr = param_.learning_rate;
+  param_.learning_rate = lr / trees.size();
+  // build tree
+  if (!builder_) {  // the Builder is created once and reused by later calls
+    builder_.reset(new Builder(
+        param_,
+        std::move(pruner_),  // NOTE(review): pruner_ is moved from here, so a pruner created by a later Init() never reaches this Builder -- confirm intended
+        std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
+  }
+  for (auto tree : trees) {
+    builder_->Update
+        (gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
+  }
+  param_.learning_rate = lr;  // restore the unscaled learning rate
+}
+
+bool QuantileHistMaker::UpdatePredictionCache(  // forwards to the Builder; returns false if no Builder exists yet or subsampling is enabled
+    const DMatrix* data,
+    HostDeviceVector<bst_float>* out_preds) {
+  if (!builder_ || param_.subsample < 1.0f) {
+    return false;
+  } else {
+    return builder_->UpdatePredictionCache(data, out_preds);
+  }
+}
+
+void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat,  // grows one tree: initialise data, expand nodes from the queue, record node stats, then prune
+                                    const GHistIndexBlockMatrix& gmatb,
+                                    const ColumnMatrix& column_matrix,
+                                    HostDeviceVector<GradientPair>* gpair,
+                                    DMatrix* p_fmat,
+                                    RegTree* p_tree) {
+  double gstart = dmlc::GetTime();
+
+  int num_leaves = 0;
+  unsigned timestamp = 0;  // incremented for every entry pushed to the queue; used as tie-breaker by the comparators
+
+  double tstart;  // per-phase timers, only reported when param_.debug_verbose > 0
+  double time_init_data = 0;
+  double time_init_new_node = 0;
+  double time_build_hist = 0;
+  double time_evaluate_split = 0;
+  double time_apply_split = 0;
+
+  const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
+
+  spliteval_->Reset();
+
+  tstart = dmlc::GetTime();
+  this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
+  time_init_data = dmlc::GetTime() - tstart;
+
+  // FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
+  CHECK_EQ(p_tree->param.num_roots, 1)
+      << "tree_method=hist does not support multiple roots at this moment";
+  for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {  // given the CHECK above, this runs once, for nid == 0
+    tstart = dmlc::GetTime();
+    hist_.AddHistRow(nid);
+    BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid]);
+    time_build_hist += dmlc::GetTime() - tstart;
+
+    tstart = dmlc::GetTime();
+    this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
+    time_init_new_node += dmlc::GetTime() - tstart;
+
+    tstart = dmlc::GetTime();
+    this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
+    time_evaluate_split += dmlc::GetTime() - tstart;
+    qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
+                               snode_[nid].best.loss_chg,
+                               timestamp++));
+    ++num_leaves;
+  }
+
+  while (!qexpand_->empty()) {  // pop candidates in the order defined by the queue's comparator
+    const ExpandEntry candidate = qexpand_->top();
+    const int nid = candidate.nid;
+    qexpand_->pop();
+    if (candidate.loss_chg <= kRtEps  // stop here: gain too small, depth limit reached, or leaf budget used up
+        || (param_.max_depth > 0 && candidate.depth == param_.max_depth)
+        || (param_.max_leaves > 0 && num_leaves == param_.max_leaves) ) {
+      (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
+    } else {
+      tstart = dmlc::GetTime();
+      this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
+      time_apply_split += dmlc::GetTime() - tstart;
+
+      tstart = dmlc::GetTime();
+      const int cleft = (*p_tree)[nid].LeftChild();
+      const int cright = (*p_tree)[nid].RightChild();
+      hist_.AddHistRow(cleft);
+      hist_.AddHistRow(cright);
+      if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {  // BuildHist runs only for the child with fewer rows
+        BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
+        SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);  // the other child's histogram comes from SubtractionTrick(sibling, parent)
+      } else {
+        BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
+        SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
+      }
+      time_build_hist += dmlc::GetTime() - tstart;
+
+      tstart = dmlc::GetTime();
+      this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
+      this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
+      bst_uint featureid = snode_[nid].best.SplitIndex();
+      spliteval_->AddSplit(nid, cleft, cright, featureid,  // report the applied split and both child weights to the split evaluator
+                           snode_[cleft].weight, snode_[cright].weight);
+      time_init_new_node += dmlc::GetTime() - tstart;
+
+      tstart = dmlc::GetTime();
+      this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree);
+      this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree);
+      time_evaluate_split += dmlc::GetTime() - tstart;
+
+      qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
+                                 snode_[cleft].best.loss_chg,
+                                 timestamp++));
+      qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
+                                 snode_[cright].best.loss_chg,
+                                 timestamp++));
+
+      ++num_leaves;  // give two and take one, as parent is no longer a leaf
+    }
+  }
+
+  // set all the rest expanding nodes to leaf
+  // This post condition is not needed in current code, but may be necessary
+  // when there are stopping rule that leaves qexpand non-empty
+  while (!qexpand_->empty()) {
+    const int nid = qexpand_->top().nid;
+    qexpand_->pop();
+    (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
+  }
+  // remember auxiliary statistics in the tree node
+  for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
+    p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
+    p_tree->Stat(nid).base_weight = snode_[nid].weight;
+    p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
+    snode_[nid].stats.SetLeafVec(param_, p_tree->Leafvec(nid));
+  }
+
+  pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});  // run the pruner on the finished tree
+
+  if (param_.debug_verbose > 0) {  // timing report: seconds per phase and share of the total
+    double total_time = dmlc::GetTime() - gstart;
+    LOG(INFO) << "\nInitData:          "
+              << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
+              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+              << time_init_data / total_time * 100 << "%)\n"
+              << "InitNewNode:       "
+              << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
+              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+              << time_init_new_node / total_time * 100 << "%)\n"
+              << "BuildHist:         "
+              << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
+              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+              << time_build_hist / total_time * 100 << "%)\n"
+              << "EvaluateSplit:     "
+              << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
+              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+              << time_evaluate_split / total_time * 100 << "%)\n"
+              << "ApplySplit:        "
+              << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
+              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+              << time_apply_split / total_time * 100 << "%)\n"
+              << "========================================\n"
+              << "Total:             "
+              << std::fixed << std::setw(6) << std::setprecision(4) << total_time;
+  }
+}
+
+bool QuantileHistMaker::Builder::UpdatePredictionCache(  // adds the last tree's leaf values to out_preds, row set by row set; returns false if `data` is not the matrix used by the last Update()
+    const DMatrix* data,
+    HostDeviceVector<bst_float>* p_out_preds) {
+  std::vector<bst_float>& out_preds = p_out_preds->HostVector();
+
+  // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
+  // conjunction with Update().
+  if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
+    return false;
+  }
+
+  if (leaf_value_cache_.empty()) {  // NOTE(review): the cache is sized here but never read or written in this function
+    leaf_value_cache_.resize(p_last_tree_->param.num_nodes,
+                             std::numeric_limits<float>::infinity());
+  }
+
+  CHECK_GT(out_preds.size(), 0U);
+
+  for (const RowSetCollection::Elem rowset : row_set_collection_) {
+    if (rowset.begin != nullptr && rowset.end != nullptr) {  // skip row sets whose begin or end pointer is null
+      int nid = rowset.node_id;
+      bst_float leaf_value;
+      // if a node is marked as deleted by the pruner, traverse upward to locate
+      // a non-deleted leaf.
+      if ((*p_last_tree_)[nid].IsDeleted()) {
+        while ((*p_last_tree_)[nid].IsDeleted()) {
+          nid = (*p_last_tree_)[nid].Parent();
+        }
+        CHECK((*p_last_tree_)[nid].IsLeaf());
+      }
+      leaf_value = (*p_last_tree_)[nid].LeafValue();
+
+      for (const size_t* it = rowset.begin; it < rowset.end; ++it) {
+        out_preds[*it] += leaf_value;  // every row in the set receives the same leaf value
+      }
+    }
+  }
+
+  return true;
+}
+
+void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,  // resets per-tree state: row sets, histograms, thread count, data layout, column sampler, node stats and expand queue
+                                      const std::vector<GradientPair>& gpair,
+                                      const DMatrix& fmat,
+                                      const RegTree& tree) {
+  CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
+      << "ColMakerHist: can only grow new tree";  // NOTE(review): message prefix does not match this class name
+  CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
+      << "max_depth or max_leaves cannot be both 0 (unlimited); "
+      << "at least one should be a positive quantity.";
+  if (param_.grow_policy == TrainParam::kDepthWise) {
+    CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
+                                << "when grow_policy is depthwise.";
+  }
+  const auto& info = fmat.Info();
+
+  {
+    // initialize the row set
+    row_set_collection_.Clear();
+    // clear local prediction cache
+    leaf_value_cache_.clear();
+    // initialize histogram collection
+    uint32_t nbins = gmat.cut.row_ptr.back();  // last row_ptr entry is used as the total number of bins
+    hist_.Init(nbins);
+
+    // initialize histogram builder
+#pragma omp parallel
+    {
+      this->nthread_ = omp_get_num_threads();  // record the size of the OpenMP thread team
+    }
+    hist_builder_.Init(this->nthread_, nbins);
+
+    CHECK_EQ(info.root_index_.size(), 0U);
+    std::vector<size_t>& row_indices = row_set_collection_.row_indices_;
+    // mark subsample and build list of member rows
+    if (param_.subsample < 1.0f) {
+      std::bernoulli_distribution coin_flip(param_.subsample);
+      auto& rnd = common::GlobalRandom();
+      for (size_t i = 0; i < info.num_row_; ++i) {
+        if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {  // rows with a negative hessian are always left out
+          row_indices.push_back(i);
+        }
+      }
+    } else {
+      for (size_t i = 0; i < info.num_row_; ++i) {
+        if (gpair[i].GetHess() >= 0.0f) {
+          row_indices.push_back(i);
+        }
+      }
+    }
+    row_set_collection_.Init();
+  }
+
+  {
+    /* determine layout of data */
+    const size_t nrow = info.num_row_;
+    const size_t ncol = info.num_col_;
+    const size_t nnz = info.num_nonzero_;
+    // number of discrete bins for feature 0
+    const uint32_t nbins_f0 = gmat.cut.row_ptr[1] - gmat.cut.row_ptr[0];
+    if (nrow * ncol == nnz) {
+      // dense data with zero-based indexing
+      data_layout_ = kDenseDataZeroBased;
+    } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
+      // dense data with one-based indexing
+      data_layout_ = kDenseDataOneBased;
+    } else {
+      // sparse data
+      data_layout_ = kSparseData;
+    }
+  }
+  {
+    // store a pointer to the tree
+    p_last_tree_ = &tree;
+    // store a pointer to training data
+    p_last_fmat_ = &fmat;
+    // initialize feature index
+    if (data_layout_ == kDenseDataOneBased) {
+      column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
+                           param_.colsample_bytree, true);  // last argument is true only for the one-based dense layout
+    } else {
+      column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
+                           param_.colsample_bytree, false);
+    }
+  }
+  if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
+    /* specialized code for dense data:
+       choose the column that has a least positive number of discrete bins.
+       For dense data (with no missing value),
+       the sum of gradient histogram is equal to snode[nid] */
+    const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
+    const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
+    uint32_t min_nbins_per_feature = 0;  // 0 means no feature with a positive bin count has been seen yet
+    for (bst_uint i = 0; i < nfeature; ++i) {
+      const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
+      if (nbins > 0) {
+        if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
+          min_nbins_per_feature = nbins;
+          fid_least_bins_ = i;
+        }
+      }
+    }
+    CHECK_GT(min_nbins_per_feature, 0U);
+  }
+  {
+    snode_.reserve(256);
+    snode_.clear();
+  }
+  {
+    if (param_.grow_policy == TrainParam::kLossGuide) {  // pick the comparator that orders the expand queue
+      qexpand_.reset(new ExpandQueue(LossGuide));
+    } else {
+      qexpand_.reset(new ExpandQueue(DepthWise));
+    }
+  }
+}
+
+void QuantileHistMaker::Builder::EvaluateSplit(int nid,  // picks the best split for node nid; the result is left in snode_[nid].best
+                                           const GHistIndexMatrix& gmat,
+                                           const HistCollection& hist,
+                                           const DMatrix& fmat,
+                                           const RegTree& tree) {
+  // start enumeration over the features returned by the column sampler for this node's depth
+  const MetaInfo& info = fmat.Info();
+  const auto& feature_set = column_sampler_.GetFeatureSet(
+      tree.GetDepth(nid)).HostVector();
+  const auto nfeature = static_cast<bst_uint>(feature_set.size());
+  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
+  best_split_tloc_.resize(nthread);  // one thread-local best-split slot per thread
+#pragma omp parallel for schedule(static) num_threads(nthread)
+  for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
+    best_split_tloc_[tid] = snode_[nid].best;  // seed every slot with the node's current best
+  }
+#pragma omp parallel for schedule(dynamic) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < nfeature; ++i) {
+    const bst_uint fid = feature_set[i];
+    const unsigned tid = omp_get_thread_num();  // each thread only writes its own slot
+    this->EnumerateSplit(-1, gmat, hist[nid], snode_[nid], info,  // backward scan (d_step = -1)
+                         &best_split_tloc_[tid], fid, nid);
+    this->EnumerateSplit(+1, gmat, hist[nid], snode_[nid], info,  // forward scan (d_step = +1)
+                         &best_split_tloc_[tid], fid, nid);
+  }
+  for (unsigned tid = 0; tid < nthread; ++tid) {
+    snode_[nid].best.Update(best_split_tloc_[tid]);  // serial merge of the per-thread results
+  }
+}
+
+/*!
+ * \brief Turn node `nid` into an internal node using its recorded best split
+ *        and distribute its rows between the two new children.
+ *
+ * \param nid            id of the node to split
+ * \param gmat           quantized matrix; its cut points map the split value
+ *                       back to a bin index
+ * \param column_matrix  column accessor for the split feature
+ * \param hist           not referenced in this function
+ * \param fmat           not referenced in this function
+ * \param p_tree         tree that receives the two child nodes
+ */
+void QuantileHistMaker::Builder::ApplySplit(int nid,
+                                        const GHistIndexMatrix& gmat,
+                                        const ColumnMatrix& column_matrix,
+                                        const HistCollection& hist,
+                                        const DMatrix& fmat,
+                                        RegTree* p_tree) {
+  // TODO(hcho3): support feature sampling by levels
+  RegTree& tree = *p_tree;
+
+  /* 1. Create child nodes */
+  const SplitEntry& chosen = snode_[nid].best;
+
+  tree.AddChilds(nid);
+  tree[nid].SetSplit(chosen.SplitIndex(), chosen.split_value, chosen.DefaultLeft());
+  // both children start out as fresh leaves (value 0, right child marker 0)
+  const int left_child = tree[nid].LeftChild();
+  const int right_child = tree[nid].RightChild();
+  tree[left_child].SetLeaf(0.0f, 0);
+  tree[right_child].SetLeaf(0.0f, 0);
+
+  /* 2. Categorize member rows */
+  row_split_tloc_.resize(static_cast<bst_omp_uint>(this->nthread_));
+  for (auto& per_thread : row_split_tloc_) {
+    per_thread.left.clear();
+    per_thread.right.clear();
+  }
+  const bool missing_goes_left = tree[nid].DefaultLeft();
+  const bst_uint split_feature = tree[nid].SplitIndex();
+  const bst_float split_value = tree[nid].SplitCond();
+  const uint32_t bin_begin = gmat.cut.row_ptr[split_feature];
+  const uint32_t bin_end = gmat.cut.row_ptr[split_feature + 1];
+  CHECK_LT(bin_end,
+           static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
+  // translate the floating-point split value into the matching bin id;
+  // -1 means the split value is less than all known cut points
+  int32_t split_bin = -1;
+  for (uint32_t bin = bin_begin; bin < bin_end; ++bin) {
+    if (gmat.cut.cut[bin] == split_value) {
+      split_bin = static_cast<int32_t>(bin);
+    }
+  }
+
+  const auto& member_rows = row_set_collection_[nid];
+
+  Column split_column = column_matrix.GetColumn(split_feature);
+  if (split_column.GetType() != xgboost::common::kDenseColumn) {
+    ApplySplitSparseData(member_rows, gmat, &row_split_tloc_, split_column,
+                         bin_begin, bin_end, split_bin, missing_goes_left);
+  } else {
+    ApplySplitDenseData(member_rows, gmat, &row_split_tloc_, split_column,
+                        split_bin, missing_goes_left);
+  }
+
+  row_set_collection_.AddSplit(nid, row_split_tloc_, left_child, right_child);
+}
+
+/*!
+ * \brief Partition the rows of `rowset` into per-thread left/right buffers
+ *        using a dense column of bin indices.
+ *
+ * A row goes left when its bin index plus the column's base index is
+ * <= split_cond; a row whose stored bin is the uint32_t maximum is treated as
+ * missing and follows `default_left`.
+ *
+ * \param rowset            rows of the node being split
+ * \param gmat              not referenced in this function
+ * \param p_row_split_tloc  per-thread output buffers, indexed by OpenMP thread id
+ * \param column            column of the split feature
+ * \param split_cond        bin id of the split point
+ * \param default_left      direction taken by rows with a missing value
+ */
+void QuantileHistMaker::Builder::ApplySplitDenseData(
+    const RowSetCollection::Elem rowset,
+    const GHistIndexMatrix& gmat,
+    std::vector<RowSetCollection::Split>* p_row_split_tloc,
+    const Column& column,
+    bst_int split_cond,
+    bool default_left) {
+  std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
+  constexpr int kUnroll = 8;  // loop unrolling factor
+  const size_t nrows = rowset.end - rowset.begin;
+  // rows left over after the last full group of kUnroll; handled serially below
+  const size_t rest = nrows % kUnroll;
+
+#pragma omp parallel for num_threads(nthread_) schedule(static)
+  for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
+    const bst_uint tid = omp_get_thread_num();
+    auto& left = row_split_tloc[tid].left;
+    auto& right = row_split_tloc[tid].right;
+    size_t rid[kUnroll];
+    uint32_t rbin[kUnroll];
+    // stage 1: gather the row ids of this group
+    for (int k = 0; k < kUnroll; ++k) {
+      rid[k] = rowset.begin[i + k];
+    }
+    // stage 2: gather the bin index of each row
+    for (int k = 0; k < kUnroll; ++k) {
+      rbin[k] = column.GetFeatureBinIdx(rid[k]);
+    }
+    // stage 3: route each row to the left or right buffer of this thread
+    for (int k = 0; k < kUnroll; ++k) {                      // NOLINT
+      if (rbin[k] == std::numeric_limits<uint32_t>::max()) {  // missing value
+        if (default_left) {
+          left.push_back(rid[k]);
+        } else {
+          right.push_back(rid[k]);
+        }
+      } else {
+        if (static_cast<int32_t>(rbin[k] + column.GetBaseIdx()) <= split_cond) {
+          left.push_back(rid[k]);
+        } else {
+          right.push_back(rid[k]);
+        }
+      }
+    }
+  }
+  // remaining rows are processed serially and appended to the buffers of the
+  // last thread slot (index nthread_-1)
+  for (size_t i = nrows - rest; i < nrows; ++i) {
+    auto& left = row_split_tloc[nthread_-1].left;
+    auto& right = row_split_tloc[nthread_-1].right;
+    const size_t rid = rowset.begin[i];
+    const uint32_t rbin = column.GetFeatureBinIdx(rid);
+    if (rbin == std::numeric_limits<uint32_t>::max()) {  // missing value
+      if (default_left) {
+        left.push_back(rid);
+      } else {
+        right.push_back(rid);
+      }
+    } else {
+      if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
+        left.push_back(rid);
+      } else {
+        right.push_back(rid);
+      }
+    }
+  }
+}
+
+/*!
+ * \brief Partition the rows of `rowset` into per-thread left/right buffers
+ *        using a sparse column.
+ *
+ * Each thread handles one contiguous slice of `rowset` and walks the column
+ * entries with a cursor. A row found in the column goes left when its bin
+ * index plus the column's base index is <= split_cond; a row not found in the
+ * column is treated as missing and follows `default_left`.
+ *
+ * \param rowset            rows of the node being split
+ * \param gmat              not referenced in this function
+ * \param p_row_split_tloc  per-thread output buffers, indexed by OpenMP thread id
+ * \param column            column of the split feature
+ * \param lower_bound       not referenced in this function
+ * \param upper_bound       not referenced in this function
+ * \param split_cond        bin id of the split point
+ * \param default_left      direction taken by rows with a missing value
+ */
+void QuantileHistMaker::Builder::ApplySplitSparseData(
+    const RowSetCollection::Elem rowset,
+    const GHistIndexMatrix& gmat,
+    std::vector<RowSetCollection::Split>* p_row_split_tloc,
+    const Column& column,
+    bst_uint lower_bound,
+    bst_uint upper_bound,
+    bst_int split_cond,
+    bool default_left) {
+  std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
+  const size_t nrows = rowset.end - rowset.begin;
+
+#pragma omp parallel num_threads(nthread_)
+  {
+    const auto tid = static_cast<size_t>(omp_get_thread_num());
+    // slice of rowset owned by this thread
+    const size_t ibegin = tid * nrows / nthread_;
+    const size_t iend = (tid + 1) * nrows / nthread_;
+    if (ibegin < iend) {  // ensure that [ibegin, iend) is nonempty range
+      // search first nonzero row with index >= rowset[ibegin]
+      // NOTE(review): the binary search presumes the column's row indices are
+      // stored in ascending order -- confirm against Column
+      const size_t* p = std::lower_bound(column.GetRowData(),
+                                         column.GetRowData() + column.Size(),
+                                         rowset.begin[ibegin]);
+
+      auto& left = row_split_tloc[tid].left;
+      auto& right = row_split_tloc[tid].right;
+      // the column has at least one entry whose row index is within this slice's range
+      if (p != column.GetRowData() + column.Size() && *p <= rowset.begin[iend - 1]) {
+        size_t cursor = p - column.GetRowData();
+
+        for (size_t i = ibegin; i < iend; ++i) {
+          const size_t rid = rowset.begin[i];
+          // advance the cursor past column entries that precede the current row
+          while (cursor < column.Size()
+                 && column.GetRowIdx(cursor) < rid
+                 && column.GetRowIdx(cursor) <= rowset.begin[iend - 1]) {
+            ++cursor;
+          }
+          if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) {
+            // the row has a stored value for this feature
+            const uint32_t rbin = column.GetFeatureBinIdx(cursor);
+            if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
+              left.push_back(rid);
+            } else {
+              right.push_back(rid);
+            }
+            ++cursor;
+          } else {
+            // missing value
+            if (default_left) {
+              left.push_back(rid);
+            } else {
+              right.push_back(rid);
+            }
+          }
+        }
+      } else {  // all rows in [ibegin, iend) have missing values
+        if (default_left) {
+          for (size_t i = ibegin; i < iend; ++i) {
+            const size_t rid = rowset.begin[i];
+            left.push_back(rid);
+          }
+        } else {
+          for (size_t i = ibegin; i < iend; ++i) {
+            const size_t rid = rowset.begin[i];
+            right.push_back(rid);
+          }
+        }
+      }
+    }
+  }
+}
+
+/*!
+ * \brief Set up the bookkeeping entry for node `nid`: gradient statistics,
+ *        weight and root gain.
+ *
+ * \param nid    id of the node to initialise
+ * \param gmat   quantized matrix; its cut row pointers locate the bins of the
+ *               feature used by the dense shortcut
+ * \param gpair  gradient pairs, indexed by row id
+ * \param fmat   not referenced in this function
+ * \param tree   tree being grown; supplies the node count and the parent id
+ */
+void QuantileHistMaker::Builder::InitNewNode(int nid,
+                                         const GHistIndexMatrix& gmat,
+                                         const std::vector<GradientPair>& gpair,
+                                         const DMatrix& fmat,
+                                         const RegTree& tree) {
+  // grow the node table so that every node of the tree has an entry
+  snode_.resize(tree.param.num_nodes, NodeEntry(param_));
+  NodeEntry& entry = snode_[nid];
+
+  if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
+    /* specialized code for dense data
+       For dense data (with no missing value),
+       the sum of gradient histogram is equal to snode[nid] */
+    const GHistRow node_hist = hist_[nid];
+    const std::vector<uint32_t>& bin_offsets = gmat.cut.row_ptr;
+
+    // summing the bins of a single feature is enough; use the one stored in
+    // fid_least_bins_
+    const uint32_t first_bin = bin_offsets[fid_least_bins_];
+    const uint32_t past_last_bin = bin_offsets[fid_least_bins_ + 1];
+    for (uint32_t bin = first_bin; bin < past_last_bin; ++bin) {
+      const GHistEntry& cell = node_hist.begin[bin];
+      entry.stats.Add(cell.sum_grad, cell.sum_hess);
+    }
+  } else {
+    // otherwise accumulate the gradient pairs of the node's rows directly
+    const RowSetCollection::Elem rows = row_set_collection_[nid];
+    for (const size_t* row = rows.begin; row < rows.end; ++row) {
+      entry.stats.Add(gpair[*row]);
+    }
+  }
+
+  // calculating the weights
+  const bst_uint parent = tree[nid].Parent();
+  entry.weight = static_cast<float>(
+      spliteval_->ComputeWeight(parent, entry.stats));
+  entry.root_gain = static_cast<float>(
+      spliteval_->ComputeScore(parent, entry.stats, entry.weight));
+}
+
+/*!
+ * \brief Enumerate the candidate split points of one feature in one
+ *        direction and merge the best one into *p_best.
+ *
+ * \param d_step  +1 scans the bins forward, -1 scans them backward
+ * \param gmat    quantized matrix providing cut points and minimum values
+ * \param hist    gradient histogram of the node
+ * \param snode   node entry holding the node's total statistics and root gain
+ * \param info    not referenced in this function
+ * \param p_best  candidate to update with the best split found
+ * \param fid     feature whose bins are scanned
+ * \param nodeID  node id forwarded to the split evaluator
+ */
+void QuantileHistMaker::Builder::EnumerateSplit(int d_step,
+                                            const GHistIndexMatrix& gmat,
+                                            const GHistRow& hist,
+                                            const NodeEntry& snode,
+                                            const MetaInfo& info,
+                                            SplitEntry* p_best,
+                                            bst_uint fid,
+                                            bst_uint nodeID) {
+  CHECK(d_step == +1 || d_step == -1);
+  const bool forward = d_step > 0;
+
+  const std::vector<uint32_t>& bin_offsets = gmat.cut.row_ptr;
+  const std::vector<bst_float>& cut_points = gmat.cut.cut;
+
+  // `scanned` sums the bins visited so far; `remainder` is the node total
+  // minus `scanned`
+  GradStats remainder(param_);
+  GradStats scanned(param_);
+  // best split found by this scan
+  SplitEntry best;
+
+  // the bin boundaries must fit in int32_t because the scan index is signed
+  CHECK_LE(bin_offsets[fid],
+           static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
+  CHECK_LE(bin_offsets[fid + 1],
+           static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
+  const auto first_bin = static_cast<int32_t>(bin_offsets[fid]);
+  const auto past_last_bin = static_cast<int32_t>(bin_offsets[fid + 1]);
+  // signed bounds so that the backward scan can stop at first_bin - 1
+  const int32_t start = forward ? first_bin : past_last_bin - 1;
+  const int32_t stop = forward ? past_last_bin : first_bin - 1;
+
+  for (int32_t bin = start; bin != stop; bin += d_step) {
+    scanned.Add(hist.begin[bin].sum_grad, hist.begin[bin].sum_hess);
+    if (!(scanned.sum_hess >= param_.min_child_weight)) {
+      continue;
+    }
+    remainder.SetSubstract(snode.stats, scanned);
+    if (!(remainder.sum_hess >= param_.min_child_weight)) {
+      continue;
+    }
+
+    bst_float gain;
+    bst_float threshold;
+    if (forward) {
+      // forward enumeration: split at right bound of each bin
+      gain = static_cast<bst_float>(
+          spliteval_->ComputeSplitScore(nodeID, fid, scanned, remainder) -
+          snode.root_gain);
+      threshold = cut_points[bin];
+    } else {
+      // backward enumeration: split at left bound of each bin
+      gain = static_cast<bst_float>(
+          spliteval_->ComputeSplitScore(nodeID, fid, remainder, scanned) -
+          snode.root_gain);
+      // for leftmost bin, left bound is the smallest feature value
+      threshold = (bin == first_bin) ? gmat.cut.min_val[fid] : cut_points[bin - 1];
+    }
+    best.Update(gain, fid, threshold, !forward);
+  }
+  p_best->Update(best);
+}
+
+// Registration under the old updater name "grow_fast_histmaker".
+// The factory logs a deprecation warning and returns a QuantileHistMaker.
+XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")
+.describe("(Deprecated, use grow_quantile_histmaker instead.)"
+          " Grow tree using quantized histogram.")
+.set_body(
+    []() {
+      LOG(WARNING) << "grow_fast_histmaker is deprecated, "
+                   << "use grow_quantile_histmaker instead.";
+      return new QuantileHistMaker();
+    });
+
+// Registration under the name "grow_quantile_histmaker"; the factory returns
+// a new QuantileHistMaker.
+XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
+.describe("Grow tree using quantized histogram.")
+.set_body(
+    []() {
+      return new QuantileHistMaker();
+    });
+
+}  // namespace tree
+}  // namespace xgboost
--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@ -0,0 +1,238 @@
+/*!
+ * Copyright 2017-2018 by Contributors
+ * \file updater_quantile_hist.h
+ * \brief use quantized feature values to construct a tree
+ * \author Philip Cho, Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
+#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
+
+#include <rabit/rabit.h>
+#include <xgboost/tree_updater.h>
+
+#include <functional>
+#include <memory>
+#include <vector>
+#include <string>
+#include <queue>
+#include <utility>
+
+#include "./param.h"
+#include "./split_evaluator.h"
+#include "../common/random.h"
+#include "../common/hist_util.h"
+#include "../common/row_set.h"
+#include "../common/column_matrix.h"
+
+namespace xgboost {
+namespace tree {
+
+using xgboost::common::HistCutMatrix;
+using xgboost::common::GHistIndexMatrix;
+using xgboost::common::GHistIndexBlockMatrix;
+using xgboost::common::GHistIndexRow;
+using xgboost::common::GHistEntry;
+using xgboost::common::HistCollection;
+using xgboost::common::RowSetCollection;
+using xgboost::common::GHistRow;
+using xgboost::common::GHistBuilder;
+using xgboost::common::ColumnMatrix;
+using xgboost::common::Column;
+
+/*!
+ * \brief construct a tree using quantized feature values
+ *
+ * The updater keeps the quantized representation of the training data and
+ * delegates the growth of each tree to the nested Builder.
+ */
+class QuantileHistMaker: public TreeUpdater {
+ public:
+  /*! \brief configure the updater from key/value arguments */
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override;
+
+  /*!
+   * \brief update the given trees
+   * \param gpair gradient pairs of the training rows
+   * \param dmat  training data
+   * \param trees trees to update
+   */
+  void Update(HostDeviceVector<GradientPair>* gpair,
+              DMatrix* dmat,
+              const std::vector<RegTree*>& trees) override;
+
+  /*!
+   * \brief update the prediction cache in out_preds for `data`
+   * \return boolean status reported to the caller
+   */
+  bool UpdatePredictionCache(const DMatrix* data,
+                             HostDeviceVector<bst_float>* out_preds) override;
+
+ protected:
+  // training parameter
+  TrainParam param_;
+  // quantized data matrix
+  GHistIndexMatrix gmat_;
+  // (optional) data matrix with feature grouping
+  GHistIndexBlockMatrix gmatb_;
+  // column accessor
+  ColumnMatrix column_matrix_;
+  // flag recording whether the quantized matrices above have been initialized
+  bool is_gmat_initialized_;
+
+  // data structure
+  struct NodeEntry {
+    /*! \brief statistics for node entry */
+    GradStats stats;
+    /*! \brief loss of this node, without split */
+    bst_float root_gain;
+    /*! \brief weight calculated related to current data */
+    float weight;
+    /*! \brief current best solution */
+    SplitEntry best;
+    // constructor
+    explicit NodeEntry(const TrainParam& param)
+        : stats(param), root_gain(0.0f), weight(0.0f) {
+    }
+  };
+
+  // actual builder that runs the algorithm
+  struct Builder {
+   public:
+    // constructor; takes ownership of the pruner and the split evaluator and
+    // keeps a reference to `param`, which must outlive the builder
+    explicit Builder(const TrainParam& param,
+                     std::unique_ptr<TreeUpdater> pruner,
+                     std::unique_ptr<SplitEvaluator> spliteval)
+      : param_(param), pruner_(std::move(pruner)),
+        spliteval_(std::move(spliteval)), p_last_tree_(nullptr),
+        p_last_fmat_(nullptr) {}
+    // Builder declares a virtual method and is meant to be subclassed, so a
+    // derived object destroyed through a Builder pointer (for example the
+    // std::unique_ptr<Builder> held by QuantileHistMaker) must run the derived
+    // destructor.
+    virtual ~Builder() = default;
+    // update one tree, growing
+    virtual void Update(const GHistIndexMatrix& gmat,
+                        const GHistIndexBlockMatrix& gmatb,
+                        const ColumnMatrix& column_matrix,
+                        HostDeviceVector<GradientPair>* gpair,
+                        DMatrix* p_fmat,
+                        RegTree* p_tree);
+
+    // build the histogram of the rows in row_indices into `hist`; uses the
+    // block matrix when feature grouping is enabled, the plain one otherwise
+    inline void BuildHist(const std::vector<GradientPair>& gpair,
+                          const RowSetCollection::Elem row_indices,
+                          const GHistIndexMatrix& gmat,
+                          const GHistIndexBlockMatrix& gmatb,
+                          GHistRow hist) {
+      if (param_.enable_feature_grouping > 0) {
+        hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist);
+      } else {
+        hist_builder_.BuildHist(gpair, row_indices, gmat, hist);
+      }
+    }
+
+    // forwards to GHistBuilder::SubtractionTrick with the same arguments
+    inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
+      hist_builder_.SubtractionTrick(self, sibling, parent);
+    }
+
+    // update the prediction cache in p_out_preds for `data`
+    bool UpdatePredictionCache(const DMatrix* data,
+                               HostDeviceVector<bst_float>* p_out_preds);
+
+   protected:
+    // initialize temp data structure
+    void InitData(const GHistIndexMatrix& gmat,
+                  const std::vector<GradientPair>& gpair,
+                  const DMatrix& fmat,
+                  const RegTree& tree);
+
+    // find the best split of node nid and store it in snode_[nid].best
+    void EvaluateSplit(int nid,
+                       const GHistIndexMatrix& gmat,
+                       const HistCollection& hist,
+                       const DMatrix& fmat,
+                       const RegTree& tree);
+
+    // apply the best split of node nid: create the children in the tree and
+    // distribute the rows of the node between them
+    void ApplySplit(int nid,
+                    const GHistIndexMatrix& gmat,
+                    const ColumnMatrix& column_matrix,
+                    const HistCollection& hist,
+                    const DMatrix& fmat,
+                    RegTree* p_tree);
+
+    // row partitioning used by ApplySplit for a dense column
+    void ApplySplitDenseData(const RowSetCollection::Elem rowset,
+                             const GHistIndexMatrix& gmat,
+                             std::vector<RowSetCollection::Split>* p_row_split_tloc,
+                             const Column& column,
+                             bst_int split_cond,
+                             bool default_left);
+
+    // row partitioning used by ApplySplit for a sparse column
+    void ApplySplitSparseData(const RowSetCollection::Elem rowset,
+                              const GHistIndexMatrix& gmat,
+                              std::vector<RowSetCollection::Split>* p_row_split_tloc,
+                              const Column& column,
+                              bst_uint lower_bound,
+                              bst_uint upper_bound,
+                              bst_int split_cond,
+                              bool default_left);
+
+    // compute statistics, weight and root gain of node nid
+    void InitNewNode(int nid,
+                     const GHistIndexMatrix& gmat,
+                     const std::vector<GradientPair>& gpair,
+                     const DMatrix& fmat,
+                     const RegTree& tree);
+
+    // enumerate the split values of specific feature
+    void EnumerateSplit(int d_step,
+                        const GHistIndexMatrix& gmat,
+                        const GHistRow& hist,
+                        const NodeEntry& snode,
+                        const MetaInfo& info,
+                        SplitEntry* p_best,
+                        bst_uint fid,
+                        bst_uint nodeID);
+
+    /* tree growing policies */
+    struct ExpandEntry {
+      int nid;
+      int depth;
+      bst_float loss_chg;
+      unsigned timestamp;
+      ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
+        : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
+    };
+    // priority_queue comparator: the entry with the smallest depth is popped
+    // first; ties are broken in favor of the smaller timestamp
+    inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
+      if (lhs.depth == rhs.depth) {
+        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
+      } else {
+        return lhs.depth > rhs.depth;  // favor small depth
+      }
+    }
+    // priority_queue comparator: the entry with the largest loss_chg is popped
+    // first; ties are broken in favor of the smaller timestamp
+    inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
+      if (lhs.loss_chg == rhs.loss_chg) {
+        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
+      } else {
+        return lhs.loss_chg < rhs.loss_chg;  // favor large loss_chg
+      }
+    }
+
+    //  --data fields--
+    const TrainParam& param_;
+    // number of omp thread used during training
+    int nthread_;
+    common::ColumnSampler column_sampler_;
+    // the internal row sets
+    RowSetCollection row_set_collection_;
+    // the temp space for split
+    std::vector<RowSetCollection::Split> row_split_tloc_;
+    std::vector<SplitEntry> best_split_tloc_;
+    /*! \brief TreeNode Data: statistics for each constructed node */
+    std::vector<NodeEntry> snode_;
+    /*! \brief cumulative histogram of gradients. */
+    HistCollection hist_;
+    /*! \brief feature with least # of bins. to be used for dense specialization
+               of InitNewNode() */
+    uint32_t fid_least_bins_;
+    /*! \brief local prediction cache; maps node id to leaf value */
+    std::vector<float> leaf_value_cache_;
+
+    GHistBuilder hist_builder_;
+    std::unique_ptr<TreeUpdater> pruner_;
+    std::unique_ptr<SplitEvaluator> spliteval_;
+
+    // back pointers to tree and data matrix
+    const RegTree* p_last_tree_;
+    const DMatrix* p_last_fmat_;
+
+    // queue of nodes waiting to be expanded; its ordering is one of the
+    // growing policies above
+    using ExpandQueue =
+        std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
+                            std::function<bool(ExpandEntry, ExpandEntry)>>;
+    std::unique_ptr<ExpandQueue> qexpand_;
+
+    // layout of the training data
+    enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
+    DataLayout data_layout_;
+  };
+
+  std::unique_ptr<Builder> builder_;
+  std::unique_ptr<TreeUpdater> pruner_;
+  std::unique_ptr<SplitEvaluator> spliteval_;
+};
+
+}  // namespace tree
+}  // namespace xgboost
+
+#endif  // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@ -4,12 +4,13 @@
 * \brief refresh the statistics and leaf value on the tree on the dataset
 * \author Tianqi Chen
 */
-
+#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>
+
 #include <vector>
 #include <limits>
+
 #include "./param.h"
-#include "../common/sync.h"
 #include "../common/io.h"

 namespace xgboost {
--- a/src/tree/updater_skmaker.cc
+++ b/src/tree/updater_skmaker.cc
@ -5,12 +5,12 @@
          a refresh is needed to make the statistics exactly correct
 * \author Tianqi Chen
 */
-
+#include <rabit/rabit.h>
 #include <xgboost/base.h>
 #include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
-#include "../common/sync.h"
+
 #include "../common/quantile.h"
 #include "../common/group_data.h"
 #include "./updater_basemaker-inl.h"
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@ -7,7 +7,6 @@
 #include <vector>
 #include <string>
 #include <limits>
-#include "../common/sync.h"
 #include "../common/io.h"

 namespace xgboost {
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@ -32,7 +32,7 @@ TEST(learner, SelectTreeMethod) {
            "grow_colmaker,prune");
  learner->Configure({arg("tree_method", "hist")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
-            "grow_fast_histmaker");
+            "grow_quantile_histmaker");
 #ifdef XGBOOST_USE_CUDA
  learner->Configure({arg("tree_method", "gpu_exact")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@ -328,8 +328,8 @@ TEST(GpuHist, ApplySplit) {
  shard->row_stride = n_cols;
  thrust::sequence(shard->ridx.CurrentDVec().tbegin(),
                   shard->ridx.CurrentDVec().tend());
+  // Free inside DeviceShard
  dh::safe_cuda(cudaMallocHost(&(shard->tmp_pinned), sizeof(int64_t)));
-
  // Initialize GPUHistMaker
  hist_maker.param_ = param;
  RegTree tree;
@ -390,15 +390,5 @@ TEST(GpuHist, ApplySplit) {
  ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16);
 }

-TEST(GpuHist, MGPU_mock) {
-  // Attempt to choose multiple GPU devices
-  int ngpu;
-  dh::safe_cuda(cudaGetDeviceCount(&ngpu));
-  CHECK_GT(ngpu, 1);
-  for (int i = 0; i < ngpu; ++i) {
-    dh::safe_cuda(cudaSetDevice(i));
-  }
-}
-
 }  // namespace tree
 }  // namespace xgboost
--- a/tests/cpp/tree/test_param.cc
+++ b/tests/cpp/tree/test_param.cc
@ -1,7 +1,7 @@
 // Copyright by Contributors
 #include "../../../src/tree/param.h"
-
 #include "../helpers.h"
+#include <gtest/gtest.h>

 TEST(Param, VectorIOStream) {
  std::vector<int> vals = {3, 2, 1};
--- a/tests/cpp/tree/test_prune.cc
+++ b/tests/cpp/tree/test_prune.cc
@ -0,0 +1,72 @@
+/*!
+ * Copyright 2018 by Contributors
+ */
+#include "../helpers.h"
+#include "../../../src/common/host_device_vector.h"
+#include <xgboost/tree_updater.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include <string>
+#include <memory>
+
+namespace xgboost {
+namespace tree {
+
+// Checks that the "prune" updater removes a split whose loss_chg is below
+// min_split_loss and keeps a split whose loss_chg is above or equal to it.
+TEST(Updater, Prune) {
+  int constexpr n_rows = 32, n_cols = 16;
+
+  std::vector<std::pair<std::string, std::string>> cfg;
+  cfg.push_back(std::pair<std::string, std::string>(
+      "num_feature", std::to_string(n_cols)));
+  cfg.push_back(std::pair<std::string, std::string>(
+      "min_split_loss", "10"));
+  cfg.push_back(std::pair<std::string, std::string>(
+      "silent", "1"));
+
+  // These data are just place holders.
+  HostDeviceVector<GradientPair> gpair =
+      { {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f},
+        {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} };
+  // Owned by a unique_ptr so the matrix is released even when a failing
+  // ASSERT_* returns from the test early.
+  std::unique_ptr<std::shared_ptr<DMatrix>> dmat(
+      CreateDMatrix(n_rows, n_cols, 0.4, 3));
+
+  // prepare tree
+  RegTree tree = RegTree();
+  tree.InitModel();
+  tree.param.InitAllowUnknown(cfg);
+  std::vector<RegTree*> trees {&tree};
+  // prepare pruner
+  std::unique_ptr<TreeUpdater> pruner(TreeUpdater::Create("prune"));
+  pruner->Init(cfg);
+
+  // loss_chg < min_split_loss;
+  tree.AddChilds(0);
+  int cleft = tree[0].LeftChild();
+  int cright = tree[0].RightChild();
+  tree[cleft].SetLeaf(0.3f, 0);
+  tree[cright].SetLeaf(0.4f, 0);
+  pruner->Update(&gpair, dmat->get(), trees);
+
+  ASSERT_EQ(tree.NumExtraNodes(), 0);
+
+  // loss_chg > min_split_loss;
+  tree.AddChilds(0);
+  cleft = tree[0].LeftChild();
+  cright = tree[0].RightChild();
+  tree[cleft].SetLeaf(0.3f, 0);
+  tree[cright].SetLeaf(0.4f, 0);
+  tree.Stat(0).loss_chg = 11;
+  pruner->Update(&gpair, dmat->get(), trees);
+
+  ASSERT_EQ(tree.NumExtraNodes(), 2);
+
+  // loss_chg == min_split_loss;
+  tree.Stat(0).loss_chg = 10;
+  pruner->Update(&gpair, dmat->get(), trees);
+
+  ASSERT_EQ(tree.NumExtraNodes(), 2);
+}
+
+}  // namespace tree
+}  // namespace xgboost
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@ -0,0 +1,181 @@
+/*!
+ * Copyright 2018 by Contributors
+ */
+#include "../helpers.h"
+#include "../../../src/tree/param.h"
+#include "../../../src/tree/updater_quantile_hist.h"
+#include "../../../src/common/host_device_vector.h"
+
+#include <xgboost/tree_updater.h>
+#include <gtest/gtest.h>
+
+#include <vector>
+#include <string>
+
+namespace xgboost {
+namespace tree {
+
+/*!
+ * \brief Test harness around QuantileHistMaker.
+ *
+ * It replaces the builder with BuilderMock, which exposes the protected
+ * Builder steps to the tests, and owns the DMatrix shared by the tests.
+ */
+class QuantileHistMock : public QuantileHistMaker {
+  static double constexpr kEps = 1e-6;
+
+  struct BuilderMock : public QuantileHistMaker::Builder {
+    using RealImpl = QuantileHistMaker::Builder;
+
+    BuilderMock(const TrainParam& param,
+                std::unique_ptr<TreeUpdater> pruner,
+                std::unique_ptr<SplitEvaluator> spliteval)
+        : RealImpl(param, std::move(pruner), std::move(spliteval)) {}
+
+   public:
+    // InitData must classify the given matrix as sparse.
+    void TestInitData(const GHistIndexMatrix& gmat,
+                      const std::vector<GradientPair>& gpair,
+                      const DMatrix& fmat,
+                      const RegTree& tree) {
+      RealImpl::InitData(gmat, gpair, fmat, tree);
+      ASSERT_EQ(data_layout_, kSparseData);
+    }
+
+    // Builds the histogram of node nid and compares every bin with the
+    // expected gradient sums.
+    void TestBuildHist(int nid,
+                       const GHistIndexMatrix& gmat,
+                       const DMatrix& fmat,
+                       const RegTree& tree) {
+      std::vector<GradientPair> gpair =
+          { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
+            {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
+      RealImpl::InitData(gmat, gpair, fmat, tree);
+      GHistIndexBlockMatrix quantile_index_block;
+      hist_.AddHistRow(nid);
+      BuildHist(gpair, row_set_collection_[nid],
+                gmat, quantile_index_block, hist_[nid]);
+      std::vector<GradientPairPrecise> solution {
+        {0.27, 0.29}, {0.27, 0.29}, {0.47, 0.49},
+        {0.27, 0.29}, {0.57, 0.59}, {0.26, 0.27},
+        {0.37, 0.39}, {0.23, 0.24}, {0.37, 0.39},
+        {0.27, 0.28}, {0.27, 0.29}, {0.37, 0.39},
+        {0.26, 0.27}, {0.23, 0.24}, {0.57, 0.59},
+        {0.47, 0.49}, {0.47, 0.49}, {0.37, 0.39},
+        {0.26, 0.27}, {0.23, 0.24}, {0.27, 0.28},
+        {0.57, 0.59}, {0.23, 0.24}, {0.47, 0.49}};
+
+      for (size_t i = 0; i < hist_[nid].size; ++i) {
+        GradientPairPrecise sol = solution[i];
+        ASSERT_NEAR(sol.GetGrad(), hist_[nid].begin[i].sum_grad, kEps);
+        ASSERT_NEAR(sol.GetHess(), hist_[nid].begin[i].sum_hess, kEps);
+      }
+    }
+
+    // Runs EvaluateSplit on the root of a dense matrix and checks the
+    // selected split.
+    void TestEvaluateSplit(const GHistIndexBlockMatrix& quantile_index_block,
+                           const RegTree& tree) {
+      std::vector<GradientPair> row_gpairs =
+          { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
+            {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
+      size_t constexpr max_bins = 4;
+      // dense; owned by a unique_ptr so the matrix is released even when a
+      // failing ASSERT_* returns from this function early
+      std::unique_ptr<std::shared_ptr<DMatrix>> dmat(
+          CreateDMatrix(n_rows, n_cols, 0, 3));
+
+      common::GHistIndexMatrix gmat;
+      gmat.Init((*dmat).get(), max_bins);
+
+      RealImpl::InitData(gmat, row_gpairs, *(*dmat), tree);
+      hist_.AddHistRow(0);
+
+      BuildHist(row_gpairs, row_set_collection_[0],
+                gmat, quantile_index_block, hist_[0]);
+
+      RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
+      // Manipulate the root_gain so that I don't have to invent an actual
+      // split.  Yes, I'm cheating.
+      snode_[0].root_gain = 0.8;
+      RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);
+
+      ASSERT_NEAR(snode_.at(0).best.loss_chg, 0.7128048, kEps);
+      ASSERT_EQ(snode_.at(0).best.SplitIndex(), 10);
+      ASSERT_NEAR(snode_.at(0).best.split_value, 0.182258, kEps);
+    }
+  };
+
+  int static constexpr n_rows = 8, n_cols = 16;
+  // matrix shared by the tests; created in the constructor, deleted in the
+  // destructor
+  std::shared_ptr<xgboost::DMatrix> *dmat;
+  const std::vector<std::pair<std::string, std::string> > cfg;
+  // hides QuantileHistMaker::builder_ so the tests talk to the mock builder
+  std::shared_ptr<BuilderMock> builder_;
+
+ public:
+  explicit QuantileHistMock(
+      const std::vector<std::pair<std::string, std::string> >& args) :
+      cfg{args} {
+    QuantileHistMaker::Init(args);
+    // the mock builder takes over the pruner of the base class and gets its
+    // own clone of the split evaluator
+    builder_.reset(
+        new BuilderMock(
+            param_,
+            std::move(pruner_),
+            std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
+    dmat = CreateDMatrix(n_rows, n_cols, 0.8, 3);
+  }
+  ~QuantileHistMock() { delete dmat; }
+
+  static size_t GetNumColumns() { return n_cols; }
+
+  void TestInitData() {
+    size_t constexpr max_bins = 4;
+    common::GHistIndexMatrix gmat;
+    gmat.Init((*dmat).get(), max_bins);
+
+    RegTree tree = RegTree();
+    tree.InitModel();
+    tree.param.InitAllowUnknown(cfg);
+
+    std::vector<GradientPair> gpair =
+        { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
+          {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
+
+    builder_->TestInitData(gmat, gpair, *(*dmat), tree);
+  }
+
+  void TestBuildHist() {
+    RegTree tree = RegTree();
+    tree.InitModel();
+    tree.param.InitAllowUnknown(cfg);
+
+    size_t constexpr max_bins = 4;
+    common::GHistIndexMatrix gmat;
+    gmat.Init((*dmat).get(), max_bins);
+
+    builder_->TestBuildHist(0, gmat, *(*dmat).get(), tree);
+  }
+
+  void TestEvaluateSplit() {
+    RegTree tree = RegTree();
+    tree.InitModel();
+    tree.param.InitAllowUnknown(cfg);
+
+    builder_->TestEvaluateSplit(gmatb_, tree);
+  }
+};
+
+// Runs the mock's InitData check, which asserts the sparse data layout.
+TEST(Updater, QuantileHist_InitData) {
+  using Arg = std::pair<std::string, std::string>;
+  const std::string n_features =
+      std::to_string(QuantileHistMock::GetNumColumns());
+  const std::vector<Arg> args {Arg("num_feature", n_features)};
+  QuantileHistMock mock(args);
+  mock.TestInitData();
+}
+
+// Runs the mock's histogram check with feature grouping switched off.
+TEST(Updater, QuantileHist_BuildHist) {
+  using Arg = std::pair<std::string, std::string>;
+  const std::string n_features =
+      std::to_string(QuantileHistMock::GetNumColumns());
+  // Don't enable feature grouping
+  const std::vector<Arg> args {
+    Arg("num_feature", n_features),
+    Arg("enable_feature_grouping", std::to_string(0))};
+  QuantileHistMock mock(args);
+  mock.TestBuildHist();
+}
+
+// Runs the mock's split evaluation check with the elastic_net split evaluator.
+TEST(Updater, QuantileHist_EvalSplits) {
+  using Arg = std::pair<std::string, std::string>;
+  const std::string n_features =
+      std::to_string(QuantileHistMock::GetNumColumns());
+  const std::vector<Arg> args {
+    Arg("num_feature", n_features),
+    Arg("split_evaluator", "elastic_net")};
+  QuantileHistMock mock(args);
+  mock.TestEvaluateSplit();
+}
+
+}  // namespace tree
+}  // namespace xgboost
--- a/tests/cpp/tree/test_refresh.cc
+++ b/tests/cpp/tree/test_refresh.cc
@ -0,0 +1,57 @@
+/*!
+ * Copyright 2018 by Contributors
+ */
+#include "../helpers.h"
+#include "../../../src/common/host_device_vector.h"
+#include <xgboost/tree_updater.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include <string>
+#include <memory>
+
+namespace xgboost {
+namespace tree {
+
+// Checks the leaf value and loss_chg statistics written by the "refresh"
+// updater on a tree with a single split.
+TEST(Updater, Refresh) {
+  int constexpr n_rows = 8, n_cols = 16;
+
+  HostDeviceVector<GradientPair> gpair =
+      { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
+        {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
+  // Owned by a unique_ptr so the matrix is released even when a failing
+  // ASSERT_* returns from the test early.
+  std::unique_ptr<std::shared_ptr<DMatrix>> dmat(
+      CreateDMatrix(n_rows, n_cols, 0.4, 3));
+  std::vector<std::pair<std::string, std::string>> cfg {
+    {"reg_alpha", "0.0"},
+    {"num_feature", std::to_string(n_cols)},
+    {"reg_lambda", "1"}};
+
+  RegTree tree = RegTree();
+  tree.InitModel();
+  tree.param.InitAllowUnknown(cfg);
+  std::vector<RegTree*> trees {&tree};
+  std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh"));
+
+  // build a tree with a single split on feature 2 at 0.2
+  tree.AddChilds(0);
+  int cleft = tree[0].LeftChild();
+  int cright = tree[0].RightChild();
+  tree[cleft].SetLeaf(0.2f, 0);
+  tree[cright].SetLeaf(0.8f, 0);
+  tree[0].SetSplit(2, 0.2f);
+
+  tree.Stat(cleft).base_weight = 1.2;
+  tree.Stat(cright).base_weight = 1.3;
+
+  refresher->Init(cfg);
+  refresher->Update(&gpair, dmat->get(), trees);
+
+  bst_float constexpr kEps = 1e-6;
+  ASSERT_NEAR(-0.183392, tree[cright].LeafValue(), kEps);
+  ASSERT_NEAR(-0.224489, tree.Stat(0).loss_chg, kEps);
+  ASSERT_NEAR(0, tree.Stat(cleft).loss_chg, kEps);
+  ASSERT_NEAR(0, tree.Stat(1).loss_chg, kEps);
+  ASSERT_NEAR(0, tree.Stat(2).loss_chg, kEps);
+}
+
+}  // namespace tree
+}  // namespace xgboost