Merge lossguide and depthwise strategies for CPU hist (#7007)

* fix java/scala tests: max_depth is also a valid parameter for lossguide

Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
ShvetsKS 2021-06-02 20:49:43 +03:00 committed by GitHub
parent ee4f51a631
commit 57c732655e
11 changed files with 415 additions and 484 deletions

View File

@@ -387,7 +387,7 @@ public class BoosterImplTest {
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
-        put("max_depth", 0);
+        put("max_depth", 3);
         put("silent", 1);
         put("objective", "binary:logistic");
         put("tree_method", "hist");
@@ -408,7 +408,7 @@ public class BoosterImplTest {
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
-        put("max_depth", 0);
+        put("max_depth", 3);
         put("silent", 1);
         put("objective", "binary:logistic");
         put("tree_method", "hist");

View File

@@ -159,7 +159,7 @@ class ScalaBoosterImplSuite extends FunSuite {
   test("test with quantile histo lossguide") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
-    val paramMap = List("max_depth" -> "0", "silent" -> "0",
+    val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap
     trainBoosterWithQuantileHisto(trainMat, Map("training" -> trainMat, "test" -> testMat),
@@ -169,7 +169,7 @@ class ScalaBoosterImplSuite extends FunSuite {
   test("test with quantile histo lossguide with max bin") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
-    val paramMap = List("max_depth" -> "0", "silent" -> "0",
+    val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
       "eval_metric" -> "auc").toMap

View File

src/tree/driver.h (new file)

@@ -0,0 +1,92 @@
/*!
 * Copyright 2021 by XGBoost Contributors
 */
#ifndef XGBOOST_TREE_DRIVER_H_
#define XGBOOST_TREE_DRIVER_H_
#include <xgboost/span.h>
#include <queue>
#include <vector>
#include "./param.h"

namespace xgboost {
namespace tree {

template <typename ExpandEntryT>
inline bool DepthWise(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
  return lhs.GetNodeId() > rhs.GetNodeId();  // favor small node id (shallower nodes first)
}

template <typename ExpandEntryT>
inline bool LossGuide(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
  if (lhs.GetLossChange() == rhs.GetLossChange()) {
    return lhs.GetNodeId() > rhs.GetNodeId();  // tie-break on node id (favor earlier nodes)
  } else {
    return lhs.GetLossChange() < rhs.GetLossChange();  // favor large loss_chg
  }
}

// Drives execution of tree building; shared by the CPU and GPU hist updaters.
template <typename ExpandEntryT>
class Driver {
  using ExpandQueue =
      std::priority_queue<ExpandEntryT, std::vector<ExpandEntryT>,
                          std::function<bool(ExpandEntryT, ExpandEntryT)>>;

 public:
  explicit Driver(TrainParam::TreeGrowPolicy policy)
      : policy_(policy),
        queue_(policy == TrainParam::kDepthWise ? DepthWise<ExpandEntryT> :
                                                  LossGuide<ExpandEntryT>) {}
  template <typename EntryIterT>
  void Push(EntryIterT begin, EntryIterT end) {
    for (auto it = begin; it != end; ++it) {
      const ExpandEntryT& e = *it;
      if (e.split.loss_chg > kRtEps) {
        queue_.push(e);
      }
    }
  }
  void Push(const std::vector<ExpandEntryT> &entries) {
    this->Push(entries.begin(), entries.end());
  }
  void Push(const ExpandEntryT e) {
    queue_.push(e);
  }
  bool IsEmpty() {
    return queue_.empty();
  }
  // Return the set of nodes to be expanded
  // This set has no dependencies between entries so they may be expanded in
  // parallel or asynchronously
  std::vector<ExpandEntryT> Pop() {
    if (queue_.empty()) return {};
    // Return a single entry for loss guided mode
    if (policy_ == TrainParam::kLossGuide) {
      ExpandEntryT e = queue_.top();
      queue_.pop();
      return {e};
    }
    // Return nodes on same level for depth wise
    std::vector<ExpandEntryT> result;
    ExpandEntryT e = queue_.top();
    int level = e.depth;
    while (e.depth == level && !queue_.empty()) {
      queue_.pop();
      result.emplace_back(e);
      if (!queue_.empty()) {
        e = queue_.top();
      }
    }
    return result;
  }

 private:
  TrainParam::TreeGrowPolicy policy_;
  ExpandQueue queue_;
};
}  // namespace tree
}  // namespace xgboost
#endif  // XGBOOST_TREE_DRIVER_H_
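Note: the unified Driver is policy-agnostic: under kDepthWise, Pop() returns every queued node of the current level, while under kLossGuide it returns only the single best candidate. A minimal caller-side sketch of the grow loop both policies now share (not part of this commit; it uses the CPUExpandEntry type introduced later in this diff, with illustrative values):

// Sketch only, assuming the CPUExpandEntry(nid, depth, loss_chg) constructor below.
Driver<CPUExpandEntry> driver(TrainParam::kDepthWise);
driver.Push(CPUExpandEntry(0, /*depth=*/0, /*loss_chg=*/0.0f));
while (!driver.IsEmpty()) {
  // One whole level per Pop() for depth-wise; one best node for loss-guided.
  for (auto const& candidate : driver.Pop()) {
    // ...evaluate and apply the split for candidate.nid, then push the
    // resulting children: driver.Push(left_entry); driver.Push(right_entry);
  }
}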

View File

src/tree/gpu_hist/driver.cuh (deleted)

@@ -1,127 +0,0 @@
/*!
 * Copyright 2020 by XGBoost Contributors
 */
#ifndef DRIVER_CUH_
#define DRIVER_CUH_
#include <xgboost/span.h>
#include <queue>
#include "../param.h"
#include "evaluate_splits.cuh"

namespace xgboost {
namespace tree {

struct ExpandEntry {
  int nid;
  int depth;
  DeviceSplitCandidate split;
  float base_weight { std::numeric_limits<float>::quiet_NaN() };
  float left_weight { std::numeric_limits<float>::quiet_NaN() };
  float right_weight { std::numeric_limits<float>::quiet_NaN() };
  ExpandEntry() = default;
  XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split,
                             float base, float left, float right)
      : nid(nid), depth(depth), split(std::move(split)), base_weight{base},
        left_weight{left}, right_weight{right} {}
  bool IsValid(const TrainParam& param, int num_leaves) const {
    if (split.loss_chg <= kRtEps) return false;
    if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
      return false;
    }
    if (split.loss_chg < param.min_split_loss) {
      return false;
    }
    if (param.max_depth > 0 && depth == param.max_depth) {
      return false;
    }
    if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
      return false;
    }
    return true;
  }
  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
    if (param.max_depth > 0 && depth >= param.max_depth) return false;
    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
    return true;
  }
  friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
    os << "ExpandEntry: \n";
    os << "nidx: " << e.nid << "\n";
    os << "depth: " << e.depth << "\n";
    os << "loss: " << e.split.loss_chg << "\n";
    os << "left_sum: " << e.split.left_sum << "\n";
    os << "right_sum: " << e.split.right_sum << "\n";
    return os;
  }
};

inline bool DepthWise(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  return lhs.depth > rhs.depth;  // favor small depth
}
inline bool LossGuide(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  if (lhs.split.loss_chg == rhs.split.loss_chg) {
    return lhs.nid > rhs.nid;  // favor small timestamp
  } else {
    return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
  }
}

// Drives execution of tree building on device
class Driver {
  using ExpandQueue =
      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                          std::function<bool(ExpandEntry, ExpandEntry)>>;

 public:
  explicit Driver(TrainParam::TreeGrowPolicy policy)
      : policy_(policy),
        queue_(policy == TrainParam::kDepthWise ? DepthWise : LossGuide) {}
  template <typename EntryIterT>
  void Push(EntryIterT begin, EntryIterT end) {
    for (auto it = begin; it != end; ++it) {
      const ExpandEntry& e = *it;
      if (e.split.loss_chg > kRtEps) {
        queue_.push(e);
      }
    }
  }
  void Push(const std::vector<ExpandEntry> &entries) {
    this->Push(entries.begin(), entries.end());
  }
  // Return the set of nodes to be expanded
  // This set has no dependencies between entries so they may be expanded in
  // parallel or asynchronously
  std::vector<ExpandEntry> Pop() {
    if (queue_.empty()) return {};
    // Return a single entry for loss guided mode
    if (policy_ == TrainParam::kLossGuide) {
      ExpandEntry e = queue_.top();
      queue_.pop();
      return {e};
    }
    // Return nodes on same level for depth wise
    std::vector<ExpandEntry> result;
    ExpandEntry e = queue_.top();
    int level = e.depth;
    while (e.depth == level && !queue_.empty()) {
      queue_.pop();
      result.emplace_back(e);
      if (!queue_.empty()) {
        e = queue_.top();
      }
    }
    return result;
  }

 private:
  TrainParam::TreeGrowPolicy policy_;
  ExpandQueue queue_;
};
}  // namespace tree
}  // namespace xgboost
#endif  // DRIVER_CUH_

View File

src/tree/gpu_hist/expand_entry.cuh (new file)

@@ -0,0 +1,76 @@
/*!
 * Copyright 2020 by XGBoost Contributors
 */
#ifndef EXPAND_ENTRY_CUH_
#define EXPAND_ENTRY_CUH_
#include <xgboost/span.h>
#include "../param.h"
#include "evaluate_splits.cuh"

namespace xgboost {
namespace tree {

struct GPUExpandEntry {
  int nid;
  int depth;
  DeviceSplitCandidate split;
  float base_weight { std::numeric_limits<float>::quiet_NaN() };
  float left_weight { std::numeric_limits<float>::quiet_NaN() };
  float right_weight { std::numeric_limits<float>::quiet_NaN() };
  GPUExpandEntry() = default;
  XGBOOST_DEVICE GPUExpandEntry(int nid, int depth, DeviceSplitCandidate split,
                                float base, float left, float right)
      : nid(nid), depth(depth), split(std::move(split)), base_weight{base},
        left_weight{left}, right_weight{right} {}
  bool IsValid(const TrainParam& param, int num_leaves) const {
    if (split.loss_chg <= kRtEps) return false;
    if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
      return false;
    }
    if (split.loss_chg < param.min_split_loss) {
      return false;
    }
    if (param.max_depth > 0 && depth == param.max_depth) {
      return false;
    }
    if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
      return false;
    }
    return true;
  }
  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
    if (param.max_depth > 0 && depth >= param.max_depth) return false;
    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
    return true;
  }
  bst_float GetLossChange() const {
    return split.loss_chg;
  }
  int GetNodeId() const {
    return nid;
  }
  int GetDepth() const {
    return depth;
  }
  friend std::ostream& operator<<(std::ostream& os, const GPUExpandEntry& e) {
    os << "GPUExpandEntry: \n";
    os << "nidx: " << e.nid << "\n";
    os << "depth: " << e.depth << "\n";
    os << "loss: " << e.split.loss_chg << "\n";
    os << "left_sum: " << e.split.left_sum << "\n";
    os << "right_sum: " << e.split.right_sum << "\n";
    return os;
  }
};
}  // namespace tree
}  // namespace xgboost
#endif  // EXPAND_ENTRY_CUH_
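Note: GPUExpandEntry carries two distinct checks. IsValid() decides whether an already-evaluated split is worth applying; the static ChildIsValid() decides whether the children of an applied split should be evaluated and queued at all. A caller-side sketch of how they are meant to combine (not part of this commit; ApplyAndEnqueue is a hypothetical helper):

// Sketch only: gate the split itself first, then gate its children.
void ApplyAndEnqueue(const GPUExpandEntry& candidate, const TrainParam& param,
                     int num_leaves, Driver<GPUExpandEntry>* driver) {
  if (!candidate.IsValid(param, num_leaves)) {
    return;  // keep this node as a leaf
  }
  // ...apply the split to the tree here (num_leaves grows by one net)...
  if (GPUExpandEntry::ChildIsValid(param, candidate.depth + 1, num_leaves + 1)) {
    // ...evaluate both children and driver->Push() the resulting entries...
  }
}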

View File

src/tree/updater_gpu_hist.cu

@@ -25,6 +25,7 @@
 #include "../data/ellpack_page.cuh"
 #include "param.h"
+#include "driver.h"
 #include "updater_gpu_common.cuh"
 #include "split_evaluator.h"
 #include "constraints.cuh"
@@ -33,7 +34,7 @@
 #include "gpu_hist/row_partitioner.cuh"
 #include "gpu_hist/histogram.cuh"
 #include "gpu_hist/evaluate_splits.cuh"
-#include "gpu_hist/driver.cuh"
+#include "gpu_hist/expand_entry.cuh"
 namespace xgboost {
 namespace tree {
@@ -321,8 +322,8 @@ struct GPUHistMakerDevice {
   }
   void EvaluateLeftRightSplits(
-      ExpandEntry candidate, int left_nidx, int right_nidx, const RegTree& tree,
-      common::Span<ExpandEntry> pinned_candidates_out) {
+      GPUExpandEntry candidate, int left_nidx, int right_nidx, const RegTree& tree,
+      common::Span<GPUExpandEntry> pinned_candidates_out) {
     dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
     GPUTrainingParam gpu_param(param);
     auto left_sampled_features =
@@ -363,7 +364,7 @@ struct GPUHistMakerDevice {
         hist.GetNodeHistogram(right_nidx)};
     auto d_splits_out = dh::ToSpan(splits_out);
     EvaluateSplits(d_splits_out, tree_evaluator.GetEvaluator<GPUTrainingParam>(), left, right);
-    dh::TemporaryArray<ExpandEntry> entries(2);
+    dh::TemporaryArray<GPUExpandEntry> entries(2);
     auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
     auto d_entries = entries.data().get();
     dh::LaunchN(device_id, 2, [=] __device__(size_t idx) {
@@ -378,12 +379,12 @@ struct GPUHistMakerDevice {
           nidx, gpu_param, GradStats{split.right_sum});
       d_entries[idx] =
-          ExpandEntry{nidx, candidate.depth + 1, d_splits_out[idx],
+          GPUExpandEntry{nidx, candidate.depth + 1, d_splits_out[idx],
                       base_weight, left_weight, right_weight};
     });
     dh::safe_cuda(cudaMemcpyAsync(
         pinned_candidates_out.data(), entries.data().get(),
-        sizeof(ExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
+        sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
   }
   void BuildHist(int nidx) {
@@ -569,7 +570,7 @@ struct GPUHistMakerDevice {
   /**
    * \brief Build GPU local histograms for the left and right child of some parent node
    */
-  void BuildHistLeftRight(const ExpandEntry &candidate, int nidx_left,
+  void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left,
                           int nidx_right, dh::AllReducer* reducer) {
     auto build_hist_nidx = nidx_left;
     auto subtraction_trick_nidx = nidx_right;
@@ -599,7 +600,7 @@ struct GPUHistMakerDevice {
     }
   }
-  void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {
+  void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) {
     RegTree& tree = *p_tree;
     auto evaluator = tree_evaluator.GetEvaluator();
     auto parent_sum = candidate.split.left_sum + candidate.split.right_sum;
@@ -647,7 +648,7 @@ struct GPUHistMakerDevice {
                       tree[candidate.nid].RightChild());
   }
-  ExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
+  GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
     constexpr bst_node_t kRootNIdx = 0;
     dh::XGBCachingDeviceAllocator<char> alloc;
     GradientPair root_sum = dh::Reduce(
@@ -670,7 +671,7 @@ struct GPUHistMakerDevice {
     // Generate first split
     auto split = this->EvaluateRootSplit(root_sum);
-    dh::TemporaryArray<ExpandEntry> entries(1);
+    dh::TemporaryArray<GPUExpandEntry> entries(1);
     auto d_entries = entries.data().get();
     auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
     GPUTrainingParam gpu_param(param);
@@ -681,20 +682,20 @@ struct GPUHistMakerDevice {
       float right_weight = evaluator.CalcWeight(
           kRootNIdx, gpu_param, GradStats{split.right_sum});
       d_entries[0] =
-          ExpandEntry(kRootNIdx, depth, split,
+          GPUExpandEntry(kRootNIdx, depth, split,
                       weight, left_weight, right_weight);
     });
-    ExpandEntry root_entry;
+    GPUExpandEntry root_entry;
     dh::safe_cuda(cudaMemcpyAsync(
         &root_entry, entries.data().get(),
-        sizeof(ExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
+        sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
     return root_entry;
   }
   void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
                   RegTree* p_tree, dh::AllReducer* reducer) {
     auto& tree = *p_tree;
-    Driver driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
+    Driver<GPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
     monitor.Start("Reset");
     this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
@@ -710,7 +711,7 @@ struct GPUHistMakerDevice {
     auto expand_set = driver.Pop();
     while (!expand_set.empty()) {
       auto new_candidates =
-          pinned.GetSpan<ExpandEntry>(expand_set.size() * 2, ExpandEntry());
+          pinned.GetSpan<GPUExpandEntry>(expand_set.size() * 2, GPUExpandEntry());
       for (auto i = 0ull; i < expand_set.size(); i++) {
         auto candidate = expand_set.at(i);
@@ -724,7 +725,7 @@ struct GPUHistMakerDevice {
         int left_child_nidx = tree[candidate.nid].LeftChild();
         int right_child_nidx = tree[candidate.nid].RightChild();
         // Only create child entries if needed
-        if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
+        if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
                                       num_leaves)) {
           monitor.Start("UpdatePosition");
           this->UpdatePosition(candidate.nid, p_tree);
@@ -741,8 +742,8 @@ struct GPUHistMakerDevice {
           monitor.Stop("EvaluateSplits");
         } else {
           // Set default
-          new_candidates[i * 2] = ExpandEntry();
-          new_candidates[i * 2 + 1] = ExpandEntry();
+          new_candidates[i * 2] = GPUExpandEntry();
+          new_candidates[i * 2 + 1] = GPUExpandEntry();
         }
       }
       dh::safe_cuda(cudaDeviceSynchronize());

View File

src/tree/updater_quantile_hist.cc

@@ -140,10 +140,11 @@ void BatchHistSynchronizer<GradientSumT>::SyncHistograms(BuilderT *builder,
     // Merging histograms from each thread into once
     builder->hist_buffer_.ReduceHist(node, r.begin(), r.end());
-    if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1) {
+    if (!(*p_tree)[entry.nid].IsRoot()) {
       const size_t parent_id = (*p_tree)[entry.nid].Parent();
+      const int subtraction_node_id = builder->nodes_for_subtraction_trick_[node].nid;
       auto parent_hist = builder->hist_[parent_id];
-      auto sibling_hist = builder->hist_[entry.sibling_nid];
+      auto sibling_hist = builder->hist_[subtraction_node_id];
       SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
     }
   });
@@ -169,13 +170,14 @@ void DistributedHistSynchronizer<GradientSumT>::SyncHistograms(BuilderT* builder
     auto this_local = builder->hist_local_worker_[entry.nid];
     CopyHist(this_local, this_hist, r.begin(), r.end());
-    if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1) {
+    if (!(*p_tree)[entry.nid].IsRoot()) {
       const size_t parent_id = (*p_tree)[entry.nid].Parent();
+      const int subtraction_node_id = builder->nodes_for_subtraction_trick_[node].nid;
       auto parent_hist = builder->hist_local_worker_[parent_id];
-      auto sibling_hist = builder->hist_[entry.sibling_nid];
+      auto sibling_hist = builder->hist_[subtraction_node_id];
       SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
       // Store posible parent node
-      auto sibling_local = builder->hist_local_worker_[entry.sibling_nid];
+      auto sibling_local = builder->hist_local_worker_[subtraction_node_id];
       CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
     }
   });
@@ -186,12 +188,14 @@
   builder->builder_monitor_.Stop("SyncHistogramsAllreduce");
-  ParallelSubtractionHist(builder, space, builder->nodes_for_explicit_hist_build_, p_tree);
+  ParallelSubtractionHist(builder, space, builder->nodes_for_explicit_hist_build_,
+                          builder->nodes_for_subtraction_trick_, p_tree);
   common::BlockedSpace2d space2(builder->nodes_for_subtraction_trick_.size(), [&](size_t) {
     return nbins;
   }, 1024);
-  ParallelSubtractionHist(builder, space2, builder->nodes_for_subtraction_trick_, p_tree);
+  ParallelSubtractionHist(builder, space2, builder->nodes_for_subtraction_trick_,
+                          builder->nodes_for_explicit_hist_build_, p_tree);
   builder->builder_monitor_.Stop("SyncHistograms");
 }
@@ -199,16 +203,18 @@ template <typename GradientSumT>
 void DistributedHistSynchronizer<GradientSumT>::ParallelSubtractionHist(
     BuilderT* builder,
     const common::BlockedSpace2d& space,
-    const std::vector<ExpandEntryT>& nodes,
+    const std::vector<CPUExpandEntry>& nodes,
+    const std::vector<CPUExpandEntry>& subtraction_nodes,
     const RegTree * p_tree) {
   common::ParallelFor2d(space, builder->nthread_, [&](size_t node, common::Range1d r) {
     const auto& entry = nodes[node];
     if (!((*p_tree)[entry.nid].IsLeftChild())) {
       auto this_hist = builder->hist_[entry.nid];
-      if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1) {
+      if (!(*p_tree)[entry.nid].IsRoot()) {
+        const int subtraction_node_id = subtraction_nodes[node].nid;
         auto parent_hist = builder->hist_[(*p_tree)[entry.nid].Parent()];
-        auto sibling_hist = builder->hist_[entry.sibling_nid];
+        auto sibling_hist = builder->hist_[subtraction_node_id];
         SubtractionHist(this_hist, parent_hist, sibling_hist, r.begin(), r.end());
       }
     }
@@ -287,18 +293,19 @@ void QuantileHistMaker::Builder<GradientSumT>::SetHistRowsAdder(
 }
 template <typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::BuildHistogramsLossGuide(
-    ExpandEntry entry, const GHistIndexMatrix &gmat,
-    const GHistIndexBlockMatrix &gmatb, RegTree *p_tree,
-    const std::vector<GradientPair> &gpair_h) {
+void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
+    const GHistIndexMatrix &gmat,
+    const GHistIndexBlockMatrix &gmatb,
+    const DMatrix& fmat,
+    RegTree *p_tree,
+    const std::vector<GradientPair> &gpair_h,
+    int *num_leaves, std::vector<CPUExpandEntry> *expand) {
+  CPUExpandEntry node(CPUExpandEntry::kRootNid, p_tree->GetDepth(0), 0.0f);
   nodes_for_explicit_hist_build_.clear();
   nodes_for_subtraction_trick_.clear();
-  nodes_for_explicit_hist_build_.push_back(entry);
-  if (entry.sibling_nid > -1) {
-    nodes_for_subtraction_trick_.emplace_back(entry.sibling_nid, entry.nid,
-        p_tree->GetDepth(entry.sibling_nid), 0.0f, 0);
-  }
+  nodes_for_explicit_hist_build_.push_back(node);
   int starting_index = std::numeric_limits<int>::max();
   int sync_count = 0;
@@ -306,6 +313,13 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildHistogramsLossGuide(
   hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
   BuildLocalHistograms(gmat, gmatb, p_tree, gpair_h);
   hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
+  this->InitNewNode(CPUExpandEntry::kRootNid, gmat, gpair_h, fmat, *p_tree);
+  this->EvaluateSplits({node}, gmat, hist_, *p_tree);
+  node.loss_chg = snode_[CPUExpandEntry::kRootNid].best.loss_chg;
+  expand->push_back(node);
+  ++(*num_leaves);
 }
 template<typename GradientSumT>
@@ -347,48 +361,17 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildLocalHistograms(
   builder_monitor_.Stop("BuildLocalHistograms");
 }
-template<typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::BuildNodeStats(
-    const GHistIndexMatrix &gmat,
-    DMatrix *p_fmat,
-    RegTree *p_tree,
-    const std::vector<GradientPair> &gpair_h) {
-  builder_monitor_.Start("BuildNodeStats");
-  for (auto const& entry : qexpand_depth_wise_) {
-    int nid = entry.nid;
-    this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
-    // add constraints
-    if (!(*p_tree)[nid].IsLeftChild() && !(*p_tree)[nid].IsRoot()) {
-      // it's a right child
-      auto parent_id = (*p_tree)[nid].Parent();
-      auto left_sibling_id = (*p_tree)[parent_id].LeftChild();
-      auto parent_split_feature_id = snode_[parent_id].best.SplitIndex();
-      tree_evaluator_.AddSplit(
-          parent_id, left_sibling_id, nid, parent_split_feature_id,
-          snode_[left_sibling_id].weight, snode_[nid].weight);
-      interaction_constraints_.Split(parent_id, parent_split_feature_id,
-                                     left_sibling_id, nid);
-    }
-  }
-  builder_monitor_.Stop("BuildNodeStats");
-}
 template<typename GradientSumT>
 void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToTree(
-    const GHistIndexMatrix &gmat,
+    const std::vector<CPUExpandEntry>& expand,
     RegTree *p_tree,
     int *num_leaves,
-    int depth,
-    unsigned *timestamp,
-    std::vector<ExpandEntry>* nodes_for_apply_split,
-    std::vector<ExpandEntry>* temp_qexpand_depth) {
+    std::vector<CPUExpandEntry>* nodes_for_apply_split) {
   auto evaluator = tree_evaluator_.GetEvaluator();
-  for (auto const& entry : qexpand_depth_wise_) {
+  for (auto const& entry : expand) {
     int nid = entry.nid;
-    if (snode_[nid].best.loss_chg < kRtEps ||
-        (param_.max_depth > 0 && depth == param_.max_depth) ||
-        (param_.max_leaves > 0 && (*num_leaves) == param_.max_leaves)) {
+    if (entry.IsValid(param_, *num_leaves)) {
       (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
     } else {
       nodes_for_apply_split->push_back(entry);
@@ -402,36 +385,12 @@ void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToTree(
                          e.best.DefaultLeft(), e.weight, left_leaf_weight,
                          right_leaf_weight, e.best.loss_chg, e.stats.GetHess(),
                          e.best.left_sum.GetHess(), e.best.right_sum.GetHess());
-      int left_id = (*p_tree)[nid].LeftChild();
-      int right_id = (*p_tree)[nid].RightChild();
-      temp_qexpand_depth->push_back(ExpandEntry(left_id, right_id,
-          p_tree->GetDepth(left_id), 0.0, (*timestamp)++));
-      temp_qexpand_depth->push_back(ExpandEntry(right_id, left_id,
-          p_tree->GetDepth(right_id), 0.0, (*timestamp)++));
       // - 1 parent + 2 new children
       (*num_leaves)++;
     }
   }
 }
-template<typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::EvaluateAndApplySplits(
-    const GHistIndexMatrix &gmat,
-    const ColumnMatrix &column_matrix,
-    RegTree *p_tree,
-    int *num_leaves,
-    int depth,
-    unsigned *timestamp,
-    std::vector<ExpandEntry> *temp_qexpand_depth) {
-  EvaluateSplits(qexpand_depth_wise_, gmat, hist_, *p_tree);
-  std::vector<ExpandEntry> nodes_for_apply_split;
-  AddSplitsToTree(gmat, p_tree, num_leaves, depth, timestamp,
-                  &nodes_for_apply_split, temp_qexpand_depth);
-  ApplySplit(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree);
-}
 // Split nodes to 2 sets depending on amount of rows in each node
 // Histograms for small nodes will be built explicitly
 // Histograms for big nodes will be built by 'Subtraction Trick'
@@ -440,148 +399,109 @@ void QuantileHistMaker::Builder<GradientSumT>::EvaluateAndApplySplits(
 // This ensures that the workers operate on the same set of tree nodes.
 template <typename GradientSumT>
 void QuantileHistMaker::Builder<GradientSumT>::SplitSiblings(
-    const std::vector<ExpandEntry> &nodes,
-    std::vector<ExpandEntry> *small_siblings,
-    std::vector<ExpandEntry> *big_siblings, RegTree *p_tree) {
+    const std::vector<CPUExpandEntry> &nodes_for_apply_split,
+    std::vector<CPUExpandEntry> *nodes_to_evaluate, RegTree *p_tree) {
   builder_monitor_.Start("SplitSiblings");
-  for (auto const& entry : nodes) {
+  for (auto const& entry : nodes_for_apply_split) {
     int nid = entry.nid;
-    RegTree::Node &node = (*p_tree)[nid];
-    if (node.IsRoot()) {
-      small_siblings->push_back(entry);
-    } else {
-      const int32_t left_id = (*p_tree)[node.Parent()].LeftChild();
-      const int32_t right_id = (*p_tree)[node.Parent()].RightChild();
-      if (nid == left_id && row_set_collection_[left_id ].Size() <
-                            row_set_collection_[right_id].Size()) {
-        small_siblings->push_back(entry);
-      } else if (nid == right_id && row_set_collection_[right_id].Size() <=
-                                    row_set_collection_[left_id ].Size()) {
-        small_siblings->push_back(entry);
-      } else {
-        big_siblings->push_back(entry);
-      }
-    }
-  }
-  builder_monitor_.Stop("SplitSiblings");
-}
-template<typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::ExpandWithDepthWise(
-    const GHistIndexMatrix &gmat,
-    const GHistIndexBlockMatrix &gmatb,
-    const ColumnMatrix &column_matrix,
-    DMatrix *p_fmat,
-    RegTree *p_tree,
-    const std::vector<GradientPair> &gpair_h) {
-  unsigned timestamp = 0;
-  int num_leaves = 0;
-  // in depth_wise growing, we feed loss_chg with 0.0 since it is not used anyway
-  qexpand_depth_wise_.emplace_back(ExpandEntry(ExpandEntry::kRootNid, ExpandEntry::kEmptyNid,
-      p_tree->GetDepth(ExpandEntry::kRootNid), 0.0, timestamp++));
-  ++num_leaves;
-  for (int depth = 0; depth < param_.max_depth + 1; depth++) {
-    int starting_index = std::numeric_limits<int>::max();
-    int sync_count = 0;
-    std::vector<ExpandEntry> temp_qexpand_depth;
-    SplitSiblings(qexpand_depth_wise_, &nodes_for_explicit_hist_build_,
-                  &nodes_for_subtraction_trick_, p_tree);
-    hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
-    BuildLocalHistograms(gmat, gmatb, p_tree, gpair_h);
-    hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
-    BuildNodeStats(gmat, p_fmat, p_tree, gpair_h);
-    EvaluateAndApplySplits(gmat, column_matrix, p_tree, &num_leaves, depth, &timestamp,
-                           &temp_qexpand_depth);
-    // clean up
-    qexpand_depth_wise_.clear();
-    nodes_for_subtraction_trick_.clear();
-    nodes_for_explicit_hist_build_.clear();
-    if (temp_qexpand_depth.empty()) {
-      break;
-    } else {
-      qexpand_depth_wise_ = temp_qexpand_depth;
-      temp_qexpand_depth.clear();
-    }
-  }
-}
-template<typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::ExpandWithLossGuide(
-    const GHistIndexMatrix& gmat,
-    const GHistIndexBlockMatrix& gmatb,
-    const ColumnMatrix& column_matrix,
-    DMatrix* p_fmat,
-    RegTree* p_tree,
-    const std::vector<GradientPair>& gpair_h) {
-  builder_monitor_.Start("ExpandWithLossGuide");
-  unsigned timestamp = 0;
-  int num_leaves = 0;
-  ExpandEntry node(ExpandEntry::kRootNid, ExpandEntry::kEmptyNid,
-      p_tree->GetDepth(0), 0.0f, timestamp++);
-  BuildHistogramsLossGuide(node, gmat, gmatb, p_tree, gpair_h);
-  this->InitNewNode(ExpandEntry::kRootNid, gmat, gpair_h, *p_fmat, *p_tree);
-  this->EvaluateSplits({node}, gmat, hist_, *p_tree);
-  node.loss_chg = snode_[ExpandEntry::kRootNid].best.loss_chg;
-  qexpand_loss_guided_->push(node);
-  ++num_leaves;
-  while (!qexpand_loss_guided_->empty()) {
-    const ExpandEntry candidate = qexpand_loss_guided_->top();
-    const int nid = candidate.nid;
-    qexpand_loss_guided_->pop();
-    if (candidate.IsValid(param_, num_leaves)) {
-      (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
-    } else {
-      auto evaluator = tree_evaluator_.GetEvaluator();
-      NodeEntry& e = snode_[nid];
-      bst_float left_leaf_weight =
-          evaluator.CalcWeight(nid, param_, GradStats{e.best.left_sum}) * param_.learning_rate;
-      bst_float right_leaf_weight =
-          evaluator.CalcWeight(nid, param_, GradStats{e.best.right_sum}) * param_.learning_rate;
-      p_tree->ExpandNode(nid, e.best.SplitIndex(), e.best.split_value,
-                         e.best.DefaultLeft(), e.weight, left_leaf_weight,
-                         right_leaf_weight, e.best.loss_chg, e.stats.GetHess(),
-                         e.best.left_sum.GetHess(), e.best.right_sum.GetHess());
-      this->ApplySplit({candidate}, gmat, column_matrix, hist_, p_tree);
     const int cleft = (*p_tree)[nid].LeftChild();
     const int cright = (*p_tree)[nid].RightChild();
-      ExpandEntry left_node(cleft, cright, p_tree->GetDepth(cleft),
-                            0.0f, timestamp++);
-      ExpandEntry right_node(cright, cleft, p_tree->GetDepth(cright),
-                             0.0f, timestamp++);
+    const CPUExpandEntry left_node = CPUExpandEntry(cleft, p_tree->GetDepth(cleft), 0.0);
+    const CPUExpandEntry right_node = CPUExpandEntry(cright, p_tree->GetDepth(cright), 0.0);
+    nodes_to_evaluate->push_back(left_node);
+    nodes_to_evaluate->push_back(right_node);
     if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
-      BuildHistogramsLossGuide(left_node, gmat, gmatb, p_tree, gpair_h);
+      nodes_for_explicit_hist_build_.push_back(left_node);
+      nodes_for_subtraction_trick_.push_back(right_node);
     } else {
-      BuildHistogramsLossGuide(right_node, gmat, gmatb, p_tree, gpair_h);
+      nodes_for_explicit_hist_build_.push_back(right_node);
+      nodes_for_subtraction_trick_.push_back(left_node);
+    }
+  }
+  CHECK_EQ(nodes_for_subtraction_trick_.size(), nodes_for_explicit_hist_build_.size());
+  builder_monitor_.Stop("SplitSiblings");
 }
-    this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
-    this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
+template <typename GradientSumT>
+void QuantileHistMaker::Builder<GradientSumT>::BuildNodeStats(
+    const GHistIndexMatrix &gmat,
+    const DMatrix& fmat,
+    const std::vector<GradientPair> &gpair_h,
+    const std::vector<CPUExpandEntry>& nodes_for_apply_split, RegTree *p_tree) {
+  for (auto const& candidate : nodes_for_apply_split) {
+    const int nid = candidate.nid;
+    const int cleft = (*p_tree)[nid].LeftChild();
+    const int cright = (*p_tree)[nid].RightChild();
+    InitNewNode(cleft, gmat, gpair_h, fmat, *p_tree);
+    InitNewNode(cright, gmat, gpair_h, fmat, *p_tree);
     bst_uint featureid = snode_[nid].best.SplitIndex();
     tree_evaluator_.AddSplit(nid, cleft, cright, featureid,
                              snode_[cleft].weight, snode_[cright].weight);
     interaction_constraints_.Split(nid, featureid, cleft, cright);
+  }
+}
+template<typename GradientSumT>
+void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
+    const GHistIndexMatrix& gmat,
+    const GHistIndexBlockMatrix& gmatb,
+    const ColumnMatrix& column_matrix,
+    DMatrix* p_fmat,
+    RegTree* p_tree,
+    const std::vector<GradientPair>& gpair_h) {
+  builder_monitor_.Start("ExpandTree");
+  int num_leaves = 0;
+  Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
+  std::vector<CPUExpandEntry> expand;
+  InitRoot(gmat, gmatb, *p_fmat, p_tree, gpair_h, &num_leaves, &expand);
+  driver.Push(expand[0]);
+  int depth = 0;
+  while (!driver.IsEmpty()) {
+    expand = driver.Pop();
+    depth = expand[0].depth + 1;
+    std::vector<CPUExpandEntry> nodes_for_apply_split;
+    std::vector<CPUExpandEntry> nodes_to_evaluate;
+    nodes_for_explicit_hist_build_.clear();
+    nodes_for_subtraction_trick_.clear();
+    AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split);
+    if (nodes_for_apply_split.size() != 0) {
+      ApplySplit(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree);
+      SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree);
+      int starting_index = std::numeric_limits<int>::max();
+      int sync_count = 0;
+      hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
+      if (depth < param_.max_depth) {
+        BuildLocalHistograms(gmat, gmatb, p_tree, gpair_h);
+        hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
+      }
+      BuildNodeStats(gmat, *p_fmat, gpair_h, nodes_for_apply_split, p_tree);
+      EvaluateSplits(nodes_to_evaluate, gmat, hist_, *p_tree);
+      for (size_t i = 0; i < nodes_for_apply_split.size(); ++i) {
+        const CPUExpandEntry candidate = nodes_for_apply_split[i];
+        const int nid = candidate.nid;
+        const int cleft = (*p_tree)[nid].LeftChild();
+        const int cright = (*p_tree)[nid].RightChild();
+        CPUExpandEntry left_node = nodes_to_evaluate[i*2 + 0];
+        CPUExpandEntry right_node = nodes_to_evaluate[i*2 + 1];
-      this->EvaluateSplits({left_node, right_node}, gmat, hist_, *p_tree);
         left_node.loss_chg = snode_[cleft].best.loss_chg;
         right_node.loss_chg = snode_[cright].best.loss_chg;
-      qexpand_loss_guided_->push(left_node);
-      qexpand_loss_guided_->push(right_node);
+        driver.Push(left_node);
+        driver.Push(right_node);
-      ++num_leaves;  // give two and take one, as parent is no longer a leaf
       }
     }
-  builder_monitor_.Stop("ExpandWithLossGuide");
+  }
+  builder_monitor_.Stop("ExpandTree");
 }
 template <typename GradientSumT>
@@ -604,11 +524,8 @@ void QuantileHistMaker::Builder<GradientSumT>::Update(
   p_last_fmat_mutable_ = p_fmat;
   this->InitData(gmat, *p_fmat, *p_tree, gpair_ptr);
-  if (param_.grow_policy == TrainParam::kLossGuide) {
-    ExpandWithLossGuide(gmat, gmatb, column_matrix, p_fmat, p_tree, *gpair_ptr);
-  } else {
-    ExpandWithDepthWise(gmat, gmatb, column_matrix, p_fmat, p_tree, *gpair_ptr);
-  }
+  ExpandTree(gmat, gmatb, column_matrix, p_fmat, p_tree, *gpair_ptr);
   for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
     p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
@@ -871,13 +788,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
     snode_.reserve(256);
     snode_.clear();
   }
-  {
-    if (param_.grow_policy == TrainParam::kLossGuide) {
-      qexpand_loss_guided_.reset(new ExpandQueue(LossGuide));
-    } else {
-      qexpand_depth_wise_.clear();
-    }
-  }
   builder_monitor_.Stop("InitData");
 }
@@ -898,7 +809,7 @@ bool QuantileHistMaker::Builder<GradientSumT>::SplitContainsMissingValues(
 // nodes_set - set of nodes to be processed in parallel
 template<typename GradientSumT>
 void QuantileHistMaker::Builder<GradientSumT>::EvaluateSplits(
-    const std::vector<ExpandEntry>& nodes_set,
+    const std::vector<CPUExpandEntry>& nodes_set,
     const GHistIndexMatrix& gmat,
     const HistCollection<GradientSumT>& hist,
     const RegTree& tree) {
@@ -1123,7 +1034,7 @@ void QuantileHistMaker::Builder<GradientSumT>::PartitionKernel(
 template <typename GradientSumT>
 void QuantileHistMaker::Builder<GradientSumT>::FindSplitConditions(
-    const std::vector<ExpandEntry>& nodes,
+    const std::vector<CPUExpandEntry>& nodes,
     const RegTree& tree,
     const GHistIndexMatrix& gmat,
     std::vector<int32_t>* split_conditions) {
@@ -1151,7 +1062,7 @@ void QuantileHistMaker::Builder<GradientSumT>::FindSplitConditions(
 }
 template <typename GradientSumT>
 void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToRowSet(
-    const std::vector<ExpandEntry>& nodes,
+    const std::vector<CPUExpandEntry>& nodes,
     RegTree* p_tree) {
   const size_t n_nodes = nodes.size();
   for (unsigned int i = 0; i < n_nodes; ++i) {
@@ -1165,7 +1076,7 @@ void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToRowSet(
 }
 template <typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::ApplySplit(const std::vector<ExpandEntry> nodes,
+void QuantileHistMaker::Builder<GradientSumT>::ApplySplit(const std::vector<CPUExpandEntry> nodes,
                                                           const GHistIndexMatrix& gmat,
                                                           const ColumnMatrix& column_matrix,
                                                           const HistCollection<GradientSumT>& hist,
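Note: the synchronizers in this file pair the node at index i of nodes_for_explicit_hist_build_ with the sibling at index i of nodes_for_subtraction_trick_, so the second histogram is derived instead of rebuilt from rows. A standalone sketch of that arithmetic (not from this codebase; Histogram is a hypothetical stand-in for GHistRow):

#include <cstddef>
#include <vector>

// Simplified: one accumulated value per feature bin.
struct Histogram {
  std::vector<double> bins;
};

// Subtraction trick: sibling[i] = parent[i] - explicit_child[i], bin by bin.
Histogram Subtract(const Histogram& parent, const Histogram& explicit_child) {
  Histogram sibling{std::vector<double>(parent.bins.size())};
  for (std::size_t i = 0; i < parent.bins.size(); ++i) {
    sibling.bins[i] = parent.bins[i] - explicit_child.bins[i];
  }
  return sibling;
}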

View File

src/tree/updater_quantile_hist.h

@@ -22,6 +22,7 @@
 #include "xgboost/json.h"
 #include "constraints.h"
 #include "./param.h"
+#include "./driver.h"
 #include "./split_evaluator.h"
 #include "../common/random.h"
 #include "../common/timer.h"
@@ -148,6 +149,36 @@ struct CPUHistMakerTrainParam
   }
 };
+/* tree growing policies */
+struct CPUExpandEntry {
+  static const int kRootNid = 0;
+  static const int kEmptyNid = -1;
+  int nid;
+  int depth;
+  bst_float loss_chg;
+  CPUExpandEntry(int nid, int depth, bst_float loss_chg)
+      : nid(nid), depth(depth), loss_chg(loss_chg) {}
+  bool IsValid(TrainParam const &param, int32_t num_leaves) const {
+    bool ret = loss_chg <= kRtEps ||
+               (param.max_depth > 0 && this->depth == param.max_depth) ||
+               (param.max_leaves > 0 && num_leaves == param.max_leaves);
+    return ret;
+  }
+  bst_float GetLossChange() const {
+    return loss_chg;
+  }
+  int GetNodeId() const {
+    return nid;
+  }
+  int GetDepth() const {
+    return depth;
+  }
+};
 /*! \brief construct a tree using quantized feature values */
 class QuantileHistMaker: public TreeUpdater {
  public:
@@ -299,28 +330,6 @@ class QuantileHistMaker: public TreeUpdater {
   friend class BatchHistRowsAdder<GradientSumT>;
   friend class DistributedHistRowsAdder<GradientSumT>;
-  /* tree growing policies */
-  struct ExpandEntry {
-    static const int kRootNid = 0;
-    static const int kEmptyNid = -1;
-    int nid;
-    int sibling_nid;
-    int depth;
-    bst_float loss_chg;
-    unsigned timestamp;
-    ExpandEntry(int nid, int sibling_nid, int depth, bst_float loss_chg,
-                unsigned tstmp)
-        : nid(nid), sibling_nid(sibling_nid), depth(depth),
-          loss_chg(loss_chg), timestamp(tstmp) {}
-    bool IsValid(TrainParam const &param, int32_t num_leaves) const {
-      bool ret = loss_chg <= kRtEps ||
-                 (param.max_depth > 0 && this->depth == param.max_depth) ||
-                 (param.max_leaves > 0 && num_leaves == param.max_leaves);
-      return ret;
-    }
-  };
   // initialize temp data structure
   void InitData(const GHistIndexMatrix& gmat,
                 const DMatrix& fmat,
@@ -333,12 +342,12 @@ class QuantileHistMaker: public TreeUpdater {
                        std::vector<GradientPair>* gpair,
                        std::vector<size_t>* row_indices);
-  void EvaluateSplits(const std::vector<ExpandEntry>& nodes_set,
+  void EvaluateSplits(const std::vector<CPUExpandEntry>& nodes_set,
                       const GHistIndexMatrix& gmat,
                       const HistCollection<GradientSumT>& hist,
                       const RegTree& tree);
-  void ApplySplit(std::vector<ExpandEntry> nodes,
+  void ApplySplit(std::vector<CPUExpandEntry> nodes,
                   const GHistIndexMatrix& gmat,
                   const ColumnMatrix& column_matrix,
                   const HistCollection<GradientSumT>& hist,
@@ -349,10 +358,10 @@ class QuantileHistMaker: public TreeUpdater {
                        const int32_t split_cond,
                        const ColumnMatrix& column_matrix, const RegTree& tree);
-  void AddSplitsToRowSet(const std::vector<ExpandEntry>& nodes, RegTree* p_tree);
+  void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree* p_tree);
-  void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
+  void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
                            const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions);
   void InitNewNode(int nid,
@@ -365,8 +374,7 @@ class QuantileHistMaker: public TreeUpdater {
   // Returns the sum of gradients corresponding to the data points that contains a non-missing
   // value for the particular feature fid.
   template <int d_step>
-  GradStats EnumerateSplit(
-      const GHistIndexMatrix &gmat, const GHistRowT &hist,
+  GradStats EnumerateSplit(const GHistIndexMatrix &gmat, const GHistRowT &hist,
                            const NodeEntry &snode, SplitEntry *p_best, bst_uint fid,
                            bst_uint nodeID,
                            TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator) const;
@@ -377,73 +385,42 @@ class QuantileHistMaker: public TreeUpdater {
   // else - there are missing values
   bool SplitContainsMissingValues(const GradStats e, const NodeEntry& snode);
-  void ExpandWithDepthWise(const GHistIndexMatrix &gmat,
-                           const GHistIndexBlockMatrix &gmatb,
-                           const ColumnMatrix &column_matrix,
-                           DMatrix *p_fmat,
-                           RegTree *p_tree,
-                           const std::vector<GradientPair> &gpair_h);
   void BuildLocalHistograms(const GHistIndexMatrix &gmat,
                             const GHistIndexBlockMatrix &gmatb,
                             RegTree *p_tree,
                             const std::vector<GradientPair> &gpair_h);
-  void BuildHistogramsLossGuide(
-      ExpandEntry entry,
-      const GHistIndexMatrix &gmat,
-      const GHistIndexBlockMatrix &gmatb,
-      RegTree *p_tree,
-      const std::vector<GradientPair> &gpair_h);
+  void InitRoot(const GHistIndexMatrix &gmat,
+                const GHistIndexBlockMatrix &gmatb,
+                const DMatrix& fmat,
+                RegTree *p_tree,
+                const std::vector<GradientPair> &gpair_h,
+                int *num_leaves, std::vector<CPUExpandEntry> *expand);
   // Split nodes to 2 sets depending on amount of rows in each node
   // Histograms for small nodes will be built explicitly
   // Histograms for big nodes will be built by 'Subtraction Trick'
-  void SplitSiblings(const std::vector<ExpandEntry>& nodes,
-                     std::vector<ExpandEntry>* small_siblings,
-                     std::vector<ExpandEntry>* big_siblings,
-                     RegTree *p_tree);
+  void SplitSiblings(const std::vector<CPUExpandEntry>& nodes,
+                     std::vector<CPUExpandEntry>* nodes_to_evaluate,
+                     RegTree *p_tree);
-  void ParallelSubtractionHist(const common::BlockedSpace2d& space,
-                               const std::vector<ExpandEntry>& nodes,
-                               const RegTree * p_tree);
+  void AddSplitsToTree(const std::vector<CPUExpandEntry>& expand,
+                       RegTree *p_tree,
+                       int *num_leaves,
+                       std::vector<CPUExpandEntry>* nodes_for_apply_split);
   void BuildNodeStats(const GHistIndexMatrix &gmat,
-                      DMatrix *p_fmat,
-                      RegTree *p_tree,
-                      const std::vector<GradientPair> &gpair_h);
+                      const DMatrix& fmat,
+                      const std::vector<GradientPair> &gpair_h,
+                      const std::vector<CPUExpandEntry>& nodes_for_apply_split, RegTree *p_tree);
-  void EvaluateAndApplySplits(const GHistIndexMatrix &gmat,
-                              const ColumnMatrix &column_matrix,
-                              RegTree *p_tree,
-                              int *num_leaves,
-                              int depth,
-                              unsigned *timestamp,
-                              std::vector<ExpandEntry> *temp_qexpand_depth);
-  void AddSplitsToTree(
-      const GHistIndexMatrix &gmat,
-      RegTree *p_tree,
-      int *num_leaves,
-      int depth,
-      unsigned *timestamp,
-      std::vector<ExpandEntry>* nodes_for_apply_split,
-      std::vector<ExpandEntry>* temp_qexpand_depth);
-  void ExpandWithLossGuide(const GHistIndexMatrix& gmat,
+  void ExpandTree(const GHistIndexMatrix& gmat,
                   const GHistIndexBlockMatrix& gmatb,
                   const ColumnMatrix& column_matrix,
                   DMatrix* p_fmat,
                   RegTree* p_tree,
                   const std::vector<GradientPair>& gpair_h);
-  inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
-    if (lhs.loss_chg == rhs.loss_chg) {
-      return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-    } else {
-      return lhs.loss_chg < rhs.loss_chg;  // favor large loss_chg
-    }
-  }
   // --data fields--
   const size_t n_trees_;
   const TrainParam& param_;
@@ -484,16 +461,14 @@ class QuantileHistMaker: public TreeUpdater {
   DMatrix* p_last_fmat_mutable_;
   using ExpandQueue =
-      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
-                          std::function<bool(ExpandEntry, ExpandEntry)>>;
-  std::unique_ptr<ExpandQueue> qexpand_loss_guided_;
-  std::vector<ExpandEntry> qexpand_depth_wise_;
+      std::priority_queue<CPUExpandEntry, std::vector<CPUExpandEntry>,
+                          std::function<bool(CPUExpandEntry, CPUExpandEntry)>>;
   // key is the node id which should be calculated by Subtraction Trick, value is the node which
   // provides the evidence for subtraction
-  std::vector<ExpandEntry> nodes_for_subtraction_trick_;
+  std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
   // list of nodes whose histograms would be built explicitly.
-  std::vector<ExpandEntry> nodes_for_explicit_hist_build_;
+  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
   enum class DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
   DataLayout data_layout_;
@@ -549,14 +524,14 @@ template <typename GradientSumT>
 class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
  public:
   using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
-  using ExpandEntryT = typename BuilderT::ExpandEntry;
   void SyncHistograms(BuilderT* builder, int starting_index,
                       int sync_count, RegTree *p_tree) override;
   void ParallelSubtractionHist(BuilderT* builder,
                                const common::BlockedSpace2d& space,
-                               const std::vector<ExpandEntryT>& nodes,
+                               const std::vector<CPUExpandEntry>& nodes,
+                               const std::vector<CPUExpandEntry>& subtraction_nodes,
                                const RegTree * p_tree);
 };

View File

tests/cpp/tree/gpu_hist/test_driver.cu

@@ -1,20 +1,21 @@
 #include <gtest/gtest.h>
-#include "../../../../src/tree/gpu_hist/driver.cuh"
+#include "../../../../src/tree/driver.h"
+#include "../../../../src/tree/gpu_hist/expand_entry.cuh"
 namespace xgboost {
 namespace tree {
 TEST(GpuHist, DriverDepthWise) {
-  Driver driver(TrainParam::kDepthWise);
+  Driver<GPUExpandEntry> driver(TrainParam::kDepthWise);
   EXPECT_TRUE(driver.Pop().empty());
   DeviceSplitCandidate split;
   split.loss_chg = 1.0f;
-  ExpandEntry root(0, 0, split, .0f, .0f, .0f);
+  GPUExpandEntry root(0, 0, split, .0f, .0f, .0f);
   driver.Push({root});
   EXPECT_EQ(driver.Pop().front().nid, 0);
-  driver.Push({ExpandEntry{1, 1, split, .0f, .0f, .0f}});
-  driver.Push({ExpandEntry{2, 1, split, .0f, .0f, .0f}});
-  driver.Push({ExpandEntry{3, 2, split, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{1, 1, split, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{2, 1, split, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{3, 2, split, .0f, .0f, .0f}});
   // Should return entries from level 1
   auto res = driver.Pop();
   EXPECT_EQ(res.size(), 2);
@@ -32,14 +33,14 @@ TEST(GpuHist, DriverLossGuided) {
   DeviceSplitCandidate low_gain;
   low_gain.loss_chg = 1.0f;
-  Driver driver(TrainParam::kLossGuide);
+  Driver<GPUExpandEntry> driver(TrainParam::kLossGuide);
   EXPECT_TRUE(driver.Pop().empty());
-  ExpandEntry root(0, 0, high_gain, .0f, .0f, .0f);
+  GPUExpandEntry root(0, 0, high_gain, .0f, .0f, .0f);
   driver.Push({root});
   EXPECT_EQ(driver.Pop().front().nid, 0);
   // Select high gain first
-  driver.Push({ExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
-  driver.Push({ExpandEntry{2, 2, high_gain, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{2, 2, high_gain, .0f, .0f, .0f}});
   auto res = driver.Pop();
   EXPECT_EQ(res.size(), 1);
   EXPECT_EQ(res[0].nid, 2);
@@ -48,8 +49,8 @@ TEST(GpuHist, DriverLossGuided) {
   EXPECT_EQ(res[0].nid, 1);
   // If equal gain, use nid
-  driver.Push({ExpandEntry{2, 1, low_gain, .0f, .0f, .0f}});
-  driver.Push({ExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{2, 1, low_gain, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
   res = driver.Pop();
   EXPECT_EQ(res[0].nid, 1);
   res = driver.Pop();
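Note: the tests above exercise the shared Driver only through the GPU entry type. A hypothetical CPU-side analogue (not part of this commit) would look the same, since Driver needs nothing beyond nid, depth, and the accessor methods:

// Sketch only, using the CPUExpandEntry type from updater_quantile_hist.h.
TEST(QuantileHist, DriverDepthWise) {
  Driver<CPUExpandEntry> driver(TrainParam::kDepthWise);
  EXPECT_TRUE(driver.Pop().empty());
  driver.Push(CPUExpandEntry(0, /*depth=*/0, /*loss_chg=*/1.0f));
  EXPECT_EQ(driver.Pop().front().nid, 0);
}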

View File

tests/cpp/tree/test_gpu_hist.cu

@@ -132,7 +132,7 @@ TEST(GpuHist, BuildHistSharedMem) {
 TEST(GpuHist, ApplySplit) {
   RegTree tree;
-  ExpandEntry candidate;
+  GPUExpandEntry candidate;
   candidate.nid = 0;
   candidate.left_weight = 1.0f;
   candidate.right_weight = 2.0f;

View File

@@ -24,7 +24,6 @@ class QuantileHistMock : public QuantileHistMaker {
 template <typename GradientSumT>
 struct BuilderMock : public QuantileHistMaker::Builder<GradientSumT> {
   using RealImpl = QuantileHistMaker::Builder<GradientSumT>;
-  using ExpandEntryT = typename RealImpl::ExpandEntry;
   using GHistRowT = typename RealImpl::GHistRowT;
   BuilderMock(const TrainParam& param,
@@ -169,19 +168,19 @@ class QuantileHistMock : public QuantileHistMaker {
     tree->ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
     tree->ExpandNode((*tree)[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
     tree->ExpandNode((*tree)[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
-    this->nodes_for_explicit_hist_build_.emplace_back(3, 4, tree->GetDepth(3), 0.0f, 0);
-    this->nodes_for_explicit_hist_build_.emplace_back(4, 3, tree->GetDepth(4), 0.0f, 0);
-    this->nodes_for_subtraction_trick_.emplace_back(5, 6, tree->GetDepth(5), 0.0f, 0);
-    this->nodes_for_subtraction_trick_.emplace_back(6, 5, tree->GetDepth(6), 0.0f, 0);
+    this->nodes_for_explicit_hist_build_.emplace_back(3, tree->GetDepth(3), 0.0f);
+    this->nodes_for_explicit_hist_build_.emplace_back(4, tree->GetDepth(4), 0.0f);
+    this->nodes_for_subtraction_trick_.emplace_back(5, tree->GetDepth(5), 0.0f);
+    this->nodes_for_subtraction_trick_.emplace_back(6, tree->GetDepth(6), 0.0f);
     this->hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, tree);
     ASSERT_EQ(sync_count, 2);
     ASSERT_EQ(starting_index, 3);
-    for (const ExpandEntryT& node : this->nodes_for_explicit_hist_build_) {
+    for (const CPUExpandEntry& node : this->nodes_for_explicit_hist_build_) {
       ASSERT_EQ(this->hist_.RowExists(node.nid), true);
     }
-    for (const ExpandEntryT& node : this->nodes_for_subtraction_trick_) {
+    for (const CPUExpandEntry& node : this->nodes_for_subtraction_trick_) {
       ASSERT_EQ(this->hist_.RowExists(node.nid), true);
     }
   }
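
Both lists get histogram rows because of the subtraction trick the second list is named after: the builder computes a histogram explicitly for one child of each parent and derives the sibling from the parent. A sketch of the per-bin arithmetic, with xgboost's GradientPair standing in for whatever summand type the real histogram rows hold:

#include <cstddef>
#include <xgboost/base.h>

// sibling = parent - explicitly built child, bin by bin.
void SubtractionTrick(xgboost::GradientPair* sibling,
                      xgboost::GradientPair const* parent,
                      xgboost::GradientPair const* child, std::size_t nbins) {
  for (std::size_t bin = 0; bin < nbins; ++bin) {
    sibling[bin] = parent[bin] - child[bin];
  }
}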
@@ -199,7 +198,7 @@ class QuantileHistMock : public QuantileHistMaker {
     this->nodes_for_explicit_hist_build_.clear();
     this->nodes_for_subtraction_trick_.clear();
     // level 0
-    this->nodes_for_explicit_hist_build_.emplace_back(0, -1, tree->GetDepth(0), 0.0f, 0);
+    this->nodes_for_explicit_hist_build_.emplace_back(0, tree->GetDepth(0), 0.0f);
     this->hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, tree);
     tree->ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
@@ -207,11 +206,9 @@ class QuantileHistMock : public QuantileHistMaker {
     this->nodes_for_subtraction_trick_.clear();
     // level 1
     this->nodes_for_explicit_hist_build_.emplace_back((*tree)[0].LeftChild(),
-                                                      (*tree)[0].RightChild(),
-                                                      tree->GetDepth(1), 0.0f, 0);
+                                                      tree->GetDepth(1), 0.0f);
     this->nodes_for_subtraction_trick_.emplace_back((*tree)[0].RightChild(),
-                                                    (*tree)[0].LeftChild(),
-                                                    tree->GetDepth(2), 0.0f, 0);
+                                                    tree->GetDepth(2), 0.0f);
     this->hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, tree);
     tree->ExpandNode((*tree)[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
     tree->ExpandNode((*tree)[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
@@ -219,10 +216,10 @@ class QuantileHistMock : public QuantileHistMaker {
     this->nodes_for_explicit_hist_build_.clear();
     this->nodes_for_subtraction_trick_.clear();
     // level 2
-    this->nodes_for_explicit_hist_build_.emplace_back(3, 4, tree->GetDepth(3), 0.0f, 0);
-    this->nodes_for_subtraction_trick_.emplace_back(4, 3, tree->GetDepth(4), 0.0f, 0);
-    this->nodes_for_explicit_hist_build_.emplace_back(5, 6, tree->GetDepth(5), 0.0f, 0);
-    this->nodes_for_subtraction_trick_.emplace_back(6, 5, tree->GetDepth(6), 0.0f, 0);
+    this->nodes_for_explicit_hist_build_.emplace_back(3, tree->GetDepth(3), 0.0f);
+    this->nodes_for_subtraction_trick_.emplace_back(4, tree->GetDepth(4), 0.0f);
+    this->nodes_for_explicit_hist_build_.emplace_back(5, tree->GetDepth(5), 0.0f);
+    this->nodes_for_subtraction_trick_.emplace_back(6, tree->GetDepth(6), 0.0f);
     this->hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, tree);
     const size_t n_nodes = this->nodes_for_explicit_hist_build_.size();
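
For context, here is the contract these tests appear to expect from AddHistRows, written as a sketch rather than the real implementation: every scheduled node gets a histogram row, starting_index tracks the smallest new nid, and sync_count reports how many histograms need synchronization. The member and method names are assumptions taken from the test code above.

#include <algorithm>

// Sketch of the assumed contract; matches ASSERT_EQ(sync_count, 2) and
// ASSERT_EQ(starting_index, 3) for the depth-wise case above.
template <typename BuilderT>
void AddHistRowsSketch(BuilderT* builder, int* starting_index, int* sync_count) {
  for (auto const& node : builder->nodes_for_explicit_hist_build_) {
    builder->hist_.AddHistRow(node.nid);
    *starting_index = std::min(*starting_index, node.nid);
  }
  for (auto const& node : builder->nodes_for_subtraction_trick_) {
    builder->hist_.AddHistRow(node.nid);
    *starting_index = std::min(*starting_index, node.nid);
  }
  *sync_count = static_cast<int>(builder->nodes_for_explicit_hist_build_.size());
}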
@@ -278,21 +275,27 @@ class QuantileHistMock : public QuantileHistMaker {
         ASSERT_EQ(p_parent[i], p_left[i] + p_right[i]);
       }
     };
-    for (const ExpandEntryT& node : this->nodes_for_explicit_hist_build_) {
+    size_t node_id = 0;
+    for (const CPUExpandEntry& node : this->nodes_for_explicit_hist_build_) {
       auto this_hist = this->hist_[node.nid];
       const size_t parent_id = (*tree)[node.nid].Parent();
+      const size_t subtraction_node_id = this->nodes_for_subtraction_trick_[node_id].nid;
       auto parent_hist = this->hist_[parent_id];
-      auto sibling_hist = this->hist_[node.sibling_nid];
+      auto sibling_hist = this->hist_[subtraction_node_id];
       check_hist(parent_hist, this_hist, sibling_hist, 0, nbins);
+      ++node_id;
     }
-    for (const ExpandEntryT& node : this->nodes_for_subtraction_trick_) {
+    node_id = 0;
+    for (const CPUExpandEntry& node : this->nodes_for_subtraction_trick_) {
       auto this_hist = this->hist_[node.nid];
       const size_t parent_id = (*tree)[node.nid].Parent();
+      const size_t subtraction_node_id = this->nodes_for_explicit_hist_build_[node_id].nid;
       auto parent_hist = this->hist_[parent_id];
-      auto sibling_hist = this->hist_[node.sibling_nid];
+      auto sibling_hist = this->hist_[subtraction_node_id];
       check_hist(parent_hist, this_hist, sibling_hist, 0, nbins);
+      ++node_id;
     }
   }
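
Note the structural change this hunk encodes: the entry no longer carries a sibling_nid field, so the test recovers each sibling positionally. That only works if the two lists are filled pairwise, with the i-th explicit child and the i-th subtraction child sharing a parent; stated explicitly as an assertion (my formulation, not the test's):

// Pairwise invariant the loops above rely on, for every index i:
ASSERT_EQ((*tree)[this->nodes_for_explicit_hist_build_[i].nid].Parent(),
          (*tree)[this->nodes_for_subtraction_trick_[i].nid].Parent());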
@@ -408,10 +411,9 @@ class QuantileHistMock : public QuantileHistMaker {
     }
     /* Now compare against result given by EvaluateSplit() */
-    typename RealImpl::ExpandEntry node(RealImpl::ExpandEntry::kRootNid,
-                                        RealImpl::ExpandEntry::kEmptyNid,
-                                        tree.GetDepth(0),
-                                        this->snode_[0].best.loss_chg, 0);
+    CPUExpandEntry node(CPUExpandEntry::kRootNid,
+                        tree.GetDepth(0),
+                        this->snode_[0].best.loss_chg);
     RealImpl::EvaluateSplits({node}, gmat, this->hist_, tree);
     ASSERT_EQ(this->snode_[0].best.SplitIndex(), best_split_feature);
     ASSERT_EQ(this->snode_[0].best.split_value, gmat.cut.Values()[best_split_threshold]);
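
Taken together, the hunks in this commit let one driver serve both grow policies on the CPU. A sketch of the grow-loop shape they enable, not the real ExpandTree; `root_gain` and `children` are hypothetical placeholders for values the real updater computes:

// One Driver<CPUExpandEntry> replaces the separate depth-wise and
// loss-guided code paths in the CPU hist updater.
Driver<CPUExpandEntry> driver(param.grow_policy);
driver.Push({CPUExpandEntry(CPUExpandEntry::kRootNid, tree.GetDepth(0), root_gain)});
for (auto nodes = driver.Pop(); !nodes.empty(); nodes = driver.Pop()) {
  // build histograms for `nodes` (subtraction trick for the siblings),
  // evaluate and apply splits, then queue the resulting children
  driver.Push(children);
}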