[POC] Experimental support for l1 error. (#7812)

Support adaptive tree, a feature supported by both sklearn and lightgbm. The tree leaf is recomputed based on the residual between labels and predictions after construction. For l1 error, the optimal value is the median (50th percentile). This is marked as experimental support for the following reasons: - The value is not well defined for distributed training, where we might have empty leaves for local workers. Right now I just use the original leaf value for computing the average with other workers, which might cause significant errors. - Some follow-ups are required for exact, pruner, and optimization of the quantile function. Also, we need to calculate the initial estimation.
2022-04-26 21:41:55 +08:00
parent ad06172c6b
commit fdf533f2b9
64 changed files with 1727 additions and 336 deletions
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -1,13 +1,17 @@
 /*!
- * Copyright 2017-2019 XGBoost contributors
+ * Copyright 2017-2022 XGBoost contributors
 */
 #pragma once
+#include <limits>
+#include <vector>
 #include "xgboost/base.h"
 #include "../../common/device_helpers.cuh"
+#include "xgboost/generic_parameters.h"
+#include "xgboost/task.h"
+#include "xgboost/tree_model.h"

 namespace xgboost {
 namespace tree {
-
 /*! \brief Count how many rows are assigned to left node. */
 __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment) {
 #if __CUDACC_VER_MAJOR__ > 8
@@ -149,23 +153,48 @@ class RowPartitioner {
  }

  /**
-   * \brief Finalise the position of all training instances after tree
-   * construction is complete. Does not update any other meta information in
-   * this data structure, so should only be used at the end of training.
+   * \brief Finalise the position of all training instances after tree construction is
+   * complete. Does not update any other meta information in this data structure, so
+   * should only be used at the end of training.
   *
-   * \param op          Device lambda. Should provide the row index  and current
-   * position as an argument and return the new position for this training
-   * instance.
+   *   When the task requires updating leaves, the node index of each row is also written
+   *   to p_out_position, bitwise-complemented (~nidx) for rows where sampledp is true.
+   *
+   * \param p_out_position Node index for each row, indexed by row index.
+   * \param op Device lambda. Receives the row index and current position as arguments
+   *           and returns the new position for this training instance.
+   * \param sampledp Device lambda taking a row index; true marks the row's index as ~nidx.
   */
-  template <typename FinalisePositionOpT>
-  void FinalisePosition(FinalisePositionOpT op) {
+  template <typename FinalisePositionOpT, typename Sampledp>
+  void FinalisePosition(Context const* ctx, ObjInfo task,
+                        HostDeviceVector<bst_node_t>* p_out_position, FinalisePositionOpT op,
+                        Sampledp sampledp) {
    auto d_position = position_.Current();
    const auto d_ridx = ridx_.Current();
+    if (!task.UpdateTreeLeaf()) {
+      dh::LaunchN(position_.Size(), [=] __device__(size_t idx) {
+        auto position = d_position[idx];
+        RowIndexT ridx = d_ridx[idx];
+        bst_node_t new_position = op(ridx, position);
+        if (new_position == kIgnoredTreePosition) {
+          return;
+        }
+        d_position[idx] = new_position;
+      });
+      return;
+    }
+
+    p_out_position->SetDevice(ctx->gpu_id);
+    p_out_position->Resize(position_.Size());
+    auto sorted_position = p_out_position->DevicePointer();
    dh::LaunchN(position_.Size(), [=] __device__(size_t idx) {
      auto position = d_position[idx];
      RowIndexT ridx = d_ridx[idx];
      bst_node_t new_position = op(ridx, position);
-      if (new_position == kIgnoredTreePosition) return;
+      sorted_position[ridx] = sampledp(ridx) ? ~new_position : new_position;
+      if (new_position == kIgnoredTreePosition) {
+        return;
+      }
      d_position[idx] = new_position;
    });
  }
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -390,7 +390,6 @@ void UpdatePredictionCacheImpl(GenericParameter const *ctx, RegTree const *p_las

  CHECK(p_last_tree);
  auto const &tree = *p_last_tree;
-  auto const &snode = hist_evaluator.Stats();
  auto evaluator = hist_evaluator.Evaluator();
  CHECK_EQ(out_preds.DeviceIdx(), GenericParameter::kCpuId);
  size_t n_nodes = p_last_tree->GetNodes().size();
@@ -401,9 +400,7 @@ void UpdatePredictionCacheImpl(GenericParameter const *ctx, RegTree const *p_las
    common::ParallelFor2d(space, ctx->Threads(), [&](size_t nidx, common::Range1d r) {
      if (!tree[nidx].IsDeleted() && tree[nidx].IsLeaf()) {
        auto const &rowset = part[nidx];
-        auto const &stats = snode[nidx];
-        auto leaf_value =
-            evaluator.CalcWeight(nidx, param, GradStats{stats.stats}) * param.learning_rate;
+        auto leaf_value = tree[nidx].LeafValue();
        for (const size_t *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) {
          out_preds(*it) += leaf_value;
        }
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -19,6 +19,7 @@
 #include "param.h"
 #include "xgboost/base.h"
 #include "xgboost/json.h"
+#include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"

 namespace xgboost {
@@ -154,6 +155,18 @@ class GloablApproxBuilder {
    monitor_->Stop(__func__);
  }

+  // Fills p_out_position only when the task updates tree leaves; monitor Start/Stop stay paired.
+  void LeafPartition(RegTree const &tree, common::Span<float> hess,
+                     std::vector<bst_node_t> *p_out_position) {
+    monitor_->Start(__func__);
+    if (evaluator_.Task().UpdateTreeLeaf()) {
+      for (auto const &part : partitioner_) {
+        part.LeafPartition(ctx_, tree, hess, p_out_position);
+      }
+    }
+    monitor_->Stop(__func__);
+  }
+
 public:
  explicit GloablApproxBuilder(TrainParam param, MetaInfo const &info, GenericParameter const *ctx,
                               std::shared_ptr<common::ColumnSampler> column_sampler, ObjInfo task,
@@ -164,8 +177,8 @@ class GloablApproxBuilder {
        ctx_{ctx},
        monitor_{monitor} {}

-  void UpdateTree(RegTree *p_tree, std::vector<GradientPair> const &gpair, common::Span<float> hess,
-                  DMatrix *p_fmat) {
+  void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,
+                  RegTree *p_tree, HostDeviceVector<bst_node_t> *p_out_position) {
    p_last_tree_ = p_tree;
    this->InitData(p_fmat, hess);

@@ -231,6 +244,9 @@ class GloablApproxBuilder {
      driver.Push(best_splits.begin(), best_splits.end());
      expand_set = driver.Pop();
    }
+
+    auto &h_position = p_out_position->HostVector();
+    this->LeafPartition(tree, hess, &h_position);
  }
 };

@@ -275,6 +291,7 @@ class GlobalApproxUpdater : public TreeUpdater {
    sampled->resize(h_gpair.size());
    std::copy(h_gpair.cbegin(), h_gpair.cend(), sampled->begin());
    auto &rnd = common::GlobalRandom();
+
    if (param.subsample != 1.0) {
      CHECK(param.sampling_method != TrainParam::kGradientBased)
          << "Gradient based sampling is not supported for approx tree method.";
@@ -292,6 +309,7 @@ class GlobalApproxUpdater : public TreeUpdater {
  char const *Name() const override { return "grow_histmaker"; }

  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *m,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree *> &trees) override {
    float lr = param_.learning_rate;
    param_.learning_rate = lr / trees.size();
@@ -313,12 +331,14 @@ class GlobalApproxUpdater : public TreeUpdater {

    cached_ = m;

+    size_t t_idx = 0;
    for (auto p_tree : trees) {
      if (hist_param_.single_precision_histogram) {
-        this->f32_impl_->UpdateTree(p_tree, h_gpair, hess, m);
+        this->f32_impl_->UpdateTree(m, h_gpair, hess, p_tree, &out_position[t_idx]);
      } else {
-        this->f64_impl_->UpdateTree(p_tree, h_gpair, hess, m);
+        this->f64_impl_->UpdateTree(m, h_gpair, hess, p_tree, &out_position[t_idx]);
      }
+      ++t_idx;
    }
    param_.learning_rate = lr;
  }
@@ -335,6 +355,8 @@ class GlobalApproxUpdater : public TreeUpdater {
    }
    return true;
  }
+
+  bool HasNodePosition() const override { return true; }
 };

 DMLC_REGISTRY_FILE_TAG(grow_histmaker);
--- a/src/tree/updater_approx.h
+++ b/src/tree/updater_approx.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2021 XGBoost contributors
+ * Copyright 2021-2022 XGBoost contributors
 *
 * \brief Implementation for the approx tree method.
 */
@@ -18,6 +18,7 @@
 #include "hist/expand_entry.h"
 #include "hist/param.h"
 #include "param.h"
+#include "xgboost/generic_parameters.h"
 #include "xgboost/json.h"
 #include "xgboost/tree_updater.h"

@@ -122,6 +123,12 @@ class ApproxRowPartitioner {

  auto const &Partitions() const { return row_set_collection_; }

+  void LeafPartition(Context const *ctx, RegTree const &tree, common::Span<float const> hess,
+                     std::vector<bst_node_t> *p_out_position) const {
+    partition_builder_.LeafPartition(ctx, tree, this->Partitions(), p_out_position,
+                                     [&](size_t idx) -> bool { return hess[idx] - .0f == .0f; });
+  }
+
  auto operator[](bst_node_t nidx) { return row_set_collection_[nidx]; }
  auto const &operator[](bst_node_t nidx) const { return row_set_collection_[nidx]; }

--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -96,9 +96,9 @@ class ColMaker: public TreeUpdater {
    }
  }

-  void Update(HostDeviceVector<GradientPair> *gpair,
-              DMatrix* dmat,
-              const std::vector<RegTree*> &trees) override {
+  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree *> &trees) override {
    if (rabit::IsDistributed()) {
      LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't "
                    "support distributed training.";
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -11,6 +11,9 @@
 #include <utility>
 #include <vector>

+#include "xgboost/base.h"
+#include "xgboost/data.h"
+#include "xgboost/generic_parameters.h"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/parameter.h"
 #include "xgboost/span.h"
@@ -35,6 +38,8 @@
 #include "gpu_hist/histogram.cuh"
 #include "gpu_hist/evaluate_splits.cuh"
 #include "gpu_hist/expand_entry.cuh"
+#include "xgboost/task.h"
+#include "xgboost/tree_model.h"

 namespace xgboost {
 namespace tree {
@@ -161,9 +166,9 @@ template <typename GradientSumT>
 struct GPUHistMakerDevice {
 private:
  GPUHistEvaluator<GradientSumT> evaluator_;
+  Context const* ctx_;

 public:
-  int device_id;
  EllpackPageImpl const* page;
  common::Span<FeatureType const> feature_types;
  BatchParam batch_param;
@@ -195,12 +200,12 @@ struct GPUHistMakerDevice {
  // Storing split categories for last node.
  dh::caching_device_vector<uint32_t> node_categories;

-  GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page,
+  GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
                     common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
                     TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
                     BatchParam _batch_param)
-      : evaluator_{_param, n_features, _device_id},
-        device_id(_device_id),
+      : evaluator_{_param, n_features, ctx->gpu_id},
+        ctx_(ctx),
        page(_page),
        feature_types{_feature_types},
        param(std::move(_param)),
@@ -216,14 +221,15 @@ struct GPUHistMakerDevice {
    node_sum_gradients.resize(param.MaxNodes());

    // Init histogram
-    hist.Init(device_id, page->Cuts().TotalBins());
-    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(device_id));
-    feature_groups.reset(new FeatureGroups(
-        page->Cuts(), page->is_dense, dh::MaxSharedMemoryOptin(device_id), sizeof(GradientSumT)));
+    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
+    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
+    feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
+                                           dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                           sizeof(GradientSumT)));
  }

  ~GPUHistMakerDevice() {  // NOLINT
-    dh::safe_cuda(cudaSetDevice(device_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
  }

  // Reset values for each update iteration
@@ -235,10 +241,10 @@ struct GPUHistMakerDevice {
    this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
                              param.colsample_bynode, param.colsample_bylevel,
                              param.colsample_bytree);
-    dh::safe_cuda(cudaSetDevice(device_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));

    this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param,
-                           device_id);
+                           ctx_->gpu_id);

    this->interaction_constraints.Reset();
    std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPairPrecise{});
@@ -256,7 +262,7 @@ struct GPUHistMakerDevice {
    histogram_rounding = CreateRoundingFactor<GradientSumT>(this->gpair);

    row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner.reset(new RowPartitioner(device_id,  sample.sample_rows));
+    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id,  sample.sample_rows));
    hist.Reset();
  }

@@ -264,10 +270,10 @@ struct GPUHistMakerDevice {
    int nidx = RegTree::kRoot;
    GPUTrainingParam gpu_param(param);
    auto sampled_features = column_sampler.GetFeatureSet(0);
-    sampled_features->SetDevice(device_id);
+    sampled_features->SetDevice(ctx_->gpu_id);
    common::Span<bst_feature_t> feature_set =
        interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
-    auto matrix = page->GetDeviceAccessor(device_id);
+    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    EvaluateSplitInputs<GradientSumT> inputs{nidx,
                                             root_sum,
                                             gpu_param,
@@ -287,14 +293,14 @@ struct GPUHistMakerDevice {
    dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
    GPUTrainingParam gpu_param(param);
    auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
-    left_sampled_features->SetDevice(device_id);
+    left_sampled_features->SetDevice(ctx_->gpu_id);
    common::Span<bst_feature_t> left_feature_set =
        interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
    auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
-    right_sampled_features->SetDevice(device_id);
+    right_sampled_features->SetDevice(ctx_->gpu_id);
    common::Span<bst_feature_t> right_feature_set =
        interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
-    auto matrix = page->GetDeviceAccessor(device_id);
+    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);

    EvaluateSplitInputs<GradientSumT> left{left_nidx,
                                           candidate.split.left_sum,
@@ -325,8 +331,8 @@ struct GPUHistMakerDevice {
    hist.AllocateHistogram(nidx);
    auto d_node_hist = hist.GetNodeHistogram(nidx);
    auto d_ridx = row_partitioner->GetRows(nidx);
-    BuildGradientHistogram(page->GetDeviceAccessor(device_id),
-                           feature_groups->DeviceAccessor(device_id), gpair,
+    BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id),
+                           feature_groups->DeviceAccessor(ctx_->gpu_id), gpair,
                           d_ridx, d_node_hist, histogram_rounding);
  }

@@ -351,7 +357,7 @@ struct GPUHistMakerDevice {
  void UpdatePosition(int nidx, RegTree* p_tree) {
    RegTree::Node split_node = (*p_tree)[nidx];
    auto split_type = p_tree->NodeSplitType(nidx);
-    auto d_matrix = page->GetDeviceAccessor(device_id);
+    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    auto node_cats = dh::ToSpan(node_categories);

    row_partitioner->UpdatePosition(
@@ -384,7 +390,8 @@ struct GPUHistMakerDevice {
  // After tree update is finished, update the position of all training
  // instances to their final leaf. This information is used later to update the
  // prediction cache
-  void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat) {
+  void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task,
+                        HostDeviceVector<bst_node_t>* p_out_position) {
    dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
    dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
                                  d_nodes.size() * sizeof(RegTree::Node),
@@ -405,17 +412,21 @@ struct GPUHistMakerDevice {

    if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) {
      row_partitioner.reset();  // Release the device memory first before reallocating
-      row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
+      row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_));
+    }
+    if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) {
+      // see comment in the `FinalisePositionInPage`.
+      LOG(FATAL) << "Current objective function can not be used with subsampled external memory.";
    }
    if (page->n_rows == p_fmat->Info().num_row_) {
-      FinalisePositionInPage(page, dh::ToSpan(d_nodes),
-                             dh::ToSpan(d_split_types), dh::ToSpan(d_categories),
-                             dh::ToSpan(d_categories_segments));
+      FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
+                             dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task,
+                             p_out_position);
    } else {
-      for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
-        FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes),
-                               dh::ToSpan(d_split_types), dh::ToSpan(d_categories),
-                               dh::ToSpan(d_categories_segments));
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
+        FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
+                               dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task,
+                               p_out_position);
      }
    }
  }
@@ -424,9 +435,13 @@ struct GPUHistMakerDevice {
                              const common::Span<RegTree::Node> d_nodes,
                              common::Span<FeatureType const> d_feature_types,
                              common::Span<uint32_t const> categories,
-                              common::Span<RegTree::Segment> categories_segments) {
-    auto d_matrix = page->GetDeviceAccessor(device_id);
+                              common::Span<RegTree::Segment> categories_segments,
+                              ObjInfo task,
+                              HostDeviceVector<bst_node_t>* p_out_position) {
+    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    auto d_gpair = this->gpair;
    row_partitioner->FinalisePosition(
+        ctx_, task, p_out_position,
        [=] __device__(size_t row_id, int position) {
          // What happens if user prune the tree?
          if (!d_matrix.IsInRange(row_id)) {
@@ -457,13 +472,20 @@ struct GPUHistMakerDevice {
            }
            node = d_nodes[position];
          }
+
          return position;
+        },
+        [d_gpair] __device__(size_t ridx) {
+          // FIXME(jiamingy): Doesn't work when sampling is used with external memory as
+          // the sampler compacts the gradient vector.
+          return d_gpair[ridx].GetHess() - .0f == 0.f;
        });
  }

-  void UpdatePredictionCache(linalg::VectorView<float> out_preds_d) {
-    dh::safe_cuda(cudaSetDevice(device_id));
-    CHECK_EQ(out_preds_d.DeviceIdx(), device_id);
+  void UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
+    CHECK(p_tree);
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
    auto d_ridx = row_partitioner->GetRows();

    GPUTrainingParam param_d(param);
@@ -476,12 +498,15 @@ struct GPUHistMakerDevice {
    auto d_node_sum_gradients = device_node_sum_gradients.data().get();
    auto tree_evaluator = evaluator_.GetEvaluator();

-    dh::LaunchN(d_ridx.size(), [=, out_preds_d = out_preds_d] __device__(int local_idx) mutable {
-      int pos = d_position[local_idx];
-      bst_float weight =
-          tree_evaluator.CalcWeight(pos, param_d, GradStats{d_node_sum_gradients[pos]});
-      static_assert(!std::is_const<decltype(out_preds_d)>::value, "");
-      out_preds_d(d_ridx[local_idx]) += weight * param_d.learning_rate;
+    auto const& h_nodes = p_tree->GetNodes();
+    dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
+    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
+                                  h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
+    auto d_nodes = dh::ToSpan(nodes);
+    dh::LaunchN(d_ridx.size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
+      bst_node_t nidx = d_position[idx];
+      auto weight = d_nodes[nidx].LeafValue();
+      out_preds_d(d_ridx[idx]) += weight;
    });
    row_partitioner.reset();
  }
@@ -610,7 +635,8 @@ struct GPUHistMakerDevice {
  }

  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo task,
-                  RegTree* p_tree, dh::AllReducer* reducer) {
+                  RegTree* p_tree, dh::AllReducer* reducer,
+                  HostDeviceVector<bst_node_t>* p_out_position) {
    auto& tree = *p_tree;
    Driver<GPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));

@@ -641,7 +667,7 @@ struct GPUHistMakerDevice {

        int left_child_nidx = tree[candidate.nid].LeftChild();
        int right_child_nidx = tree[candidate.nid].RightChild();
-        // Only create child entries if needed
+        // Only create child entries if needed
        if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
                                         num_leaves)) {
          monitor.Start("UpdatePosition");
@@ -671,7 +697,7 @@ struct GPUHistMakerDevice {
    }

    monitor.Start("FinalisePosition");
-    this->FinalisePosition(p_tree, p_fmat);
+    this->FinalisePosition(p_tree, p_fmat, task, p_out_position);
    monitor.Stop("FinalisePosition");
  }
 };
@@ -682,7 +708,7 @@ class GPUHistMakerSpecialised {
  explicit GPUHistMakerSpecialised(ObjInfo task) : task_{task} {};
  void Configure(const Args& args, GenericParameter const* generic_param) {
    param_.UpdateAllowUnknown(args);
-    generic_param_ = generic_param;
+    ctx_ = generic_param;
    hist_maker_param_.UpdateAllowUnknown(args);
    dh::CheckComputeCapability();

@@ -694,20 +720,24 @@ class GPUHistMakerSpecialised {
  }

  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) {
    monitor_.Start("Update");

    // rescale learning rate according to size of trees
    float lr = param_.learning_rate;
    param_.learning_rate = lr / trees.size();
+
    // build tree
    try {
+      size_t t_idx{0};
      for (xgboost::RegTree* tree : trees) {
-        this->UpdateTree(gpair, dmat, tree);
+        this->UpdateTree(gpair, dmat, tree, &out_position[t_idx]);

        if (hist_maker_param_.debug_synchronize) {
          this->CheckTreesSynchronized(tree);
        }
+        ++t_idx;
      }
      dh::safe_cuda(cudaGetLastError());
    } catch (const std::exception& e) {
@@ -719,41 +749,36 @@ class GPUHistMakerSpecialised {
  }

  void InitDataOnce(DMatrix* dmat) {
-    device_ = generic_param_->gpu_id;
-    CHECK_GE(device_, 0) << "Must have at least one device";
+    CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
    info_ = &dmat->Info();
-    reducer_.Init({device_});  // NOLINT
+    reducer_.Init({ctx_->gpu_id});  // NOLINT

    // Synchronise the column sampling seed
    uint32_t column_sampling_seed = common::GlobalRandom()();
    rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);

    BatchParam batch_param{
-      device_,
+      ctx_->gpu_id,
      param_.max_bin,
    };
    auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
-    dh::safe_cuda(cudaSetDevice(device_));
-    info_->feature_types.SetDevice(device_);
-    maker.reset(new GPUHistMakerDevice<GradientSumT>(device_,
-                                                     page,
-                                                     info_->feature_types.ConstDeviceSpan(),
-                                                     info_->num_row_,
-                                                     param_,
-                                                     column_sampling_seed,
-                                                     info_->num_col_,
-                                                     batch_param));
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    info_->feature_types.SetDevice(ctx_->gpu_id);
+    maker.reset(new GPUHistMakerDevice<GradientSumT>(
+        ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_,
+        column_sampling_seed, info_->num_col_, batch_param));

    p_last_fmat_ = dmat;
    initialised_ = true;
  }

-  void InitData(DMatrix* dmat) {
+  void InitData(DMatrix* dmat, RegTree const* p_tree) {
    if (!initialised_) {
      monitor_.Start("InitDataOnce");
      this->InitDataOnce(dmat);
      monitor_.Stop("InitDataOnce");
    }
+    p_last_tree_ = p_tree;
  }

  // Only call this method for testing
@@ -771,13 +796,14 @@ class GPUHistMakerSpecialised {
    CHECK(*local_tree == reference_tree);
  }

-  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree) {
+  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
+                  HostDeviceVector<bst_node_t>* p_out_position) {
    monitor_.Start("InitData");
-    this->InitData(p_fmat);
+    this->InitData(p_fmat, p_tree);
    monitor_.Stop("InitData");

-    gpair->SetDevice(device_);
-    maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_);
+    gpair->SetDevice(ctx_->gpu_id);
+    maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_, p_out_position);
  }

  bool UpdatePredictionCache(const DMatrix *data,
@@ -786,7 +812,7 @@ class GPUHistMakerSpecialised {
      return false;
    }
    monitor_.Start("UpdatePredictionCache");
-    maker->UpdatePredictionCache(p_out_preds);
+    maker->UpdatePredictionCache(p_out_preds, p_last_tree_);
    monitor_.Stop("UpdatePredictionCache");
    return true;
  }
@@ -800,12 +826,12 @@ class GPUHistMakerSpecialised {
  bool initialised_ { false };

  GPUHistMakerTrainParam hist_maker_param_;
-  GenericParameter const* generic_param_;
+  Context const* ctx_;

  dh::AllReducer reducer_;

  DMatrix* p_last_fmat_ { nullptr };
-  int device_{-1};
+  RegTree const* p_last_tree_{nullptr};
  ObjInfo task_;

  common::Monitor monitor_;
@@ -859,17 +885,17 @@ class GPUHistMaker : public TreeUpdater {
  }

  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override {
    if (hist_maker_param_.single_precision_histogram) {
-      float_maker_->Update(gpair, dmat, trees);
+      float_maker_->Update(gpair, dmat, out_position, trees);
    } else {
-      double_maker_->Update(gpair, dmat, trees);
+      double_maker_->Update(gpair, dmat, out_position, trees);
    }
  }

-  bool
-  UpdatePredictionCache(const DMatrix *data,
-                        linalg::VectorView<bst_float> p_out_preds) override {
+  bool UpdatePredictionCache(const DMatrix* data,
+                             linalg::VectorView<bst_float> p_out_preds) override {
    if (hist_maker_param_.single_precision_histogram) {
      return float_maker_->UpdatePredictionCache(data, p_out_preds);
    } else {
@@ -881,6 +907,8 @@ class GPUHistMaker : public TreeUpdater {
    return "grow_gpu_hist";
  }

+  bool HasNodePosition() const override { return true; }
+
 private:
  GPUHistMakerTrainParam hist_maker_param_;
  ObjInfo task_;
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -24,9 +24,9 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker);

 class HistMaker: public BaseMaker {
 public:
-  void Update(HostDeviceVector<GradientPair> *gpair,
-              DMatrix *p_fmat,
-              const std::vector<RegTree*> &trees) override {
+  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree *> &trees) override {
    interaction_constraints_.Configure(param_, p_fmat->Info().num_col_);
    // rescale learning rate according to size of trees
    float lr = param_.learning_rate;
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -50,9 +50,9 @@ class TreePruner: public TreeUpdater {
  }

  // update the tree, do pruning
-  void Update(HostDeviceVector<GradientPair> *gpair,
-              DMatrix *p_fmat,
-              const std::vector<RegTree*> &trees) override {
+  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree*>& trees) override {
    pruner_monitor_.Start("PrunerUpdate");
    // rescale learning rate according to size of trees
    float lr = param_.learning_rate;
@@ -61,7 +61,7 @@ class TreePruner: public TreeUpdater {
      this->DoPrune(tree);
    }
    param_.learning_rate = lr;
-    syncher_->Update(gpair, p_fmat, trees);
+    syncher_->Update(gpair, p_fmat, out_position, trees);
    pruner_monitor_.Stop("PrunerUpdate");
  }

--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -36,6 +36,7 @@ void QuantileHistMaker::Configure(const Args &args) {
 }

 void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
+                               common::Span<HostDeviceVector<bst_node_t>> out_position,
                               const std::vector<RegTree *> &trees) {
  // rescale learning rate according to size of trees
  float lr = param_.learning_rate;
@@ -53,12 +54,15 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *d
    }
  }

+  size_t t_idx{0};
  for (auto p_tree : trees) {
+    auto &t_row_position = out_position[t_idx];
    if (hist_maker_param_.single_precision_histogram) {
-      this->float_builder_->UpdateTree(gpair, dmat, p_tree);
+      this->float_builder_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
    } else {
-      this->double_builder_->UpdateTree(gpair, dmat, p_tree);
+      this->double_builder_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
    }
+    ++t_idx;
  }

  param_.learning_rate = lr;
@@ -169,13 +173,29 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildHistogram(
  }
 }

+// Forwards to each row partitioner's LeafPartition, passing p_out_position through.
+template <typename GradientSumT>
+void QuantileHistMaker::Builder<GradientSumT>::LeafPartition(
+    RegTree const &tree, common::Span<GradientPair const> gpair,
+    std::vector<bst_node_t> *p_out_position) {
+  // Return before starting the timer so every monitor Start() has a matching Stop().
+  if (!evaluator_->Task().UpdateTreeLeaf()) return;
+  monitor_->Start(__func__);
+  for (auto const &part : partitioner_) {
+    part.LeafPartition(ctx_, tree, gpair, p_out_position);
+  }
+  monitor_->Stop(__func__);
+}
+
 template <typename GradientSumT>
 void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
-    DMatrix *p_fmat, RegTree *p_tree, const std::vector<GradientPair> &gpair_h) {
+    DMatrix *p_fmat, RegTree *p_tree, const std::vector<GradientPair> &gpair_h,
+    HostDeviceVector<bst_node_t> *p_out_position) {
  monitor_->Start(__func__);

  Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
  driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h));
+  auto const &tree = *p_tree;
  bst_node_t num_leaves{1};
  auto expand_set = driver.Pop();

@@ -208,7 +228,6 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
    std::vector<CPUExpandEntry> best_splits;
    if (!valid_candidates.empty()) {
      this->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair_h);
-      auto const &tree = *p_tree;
      for (auto const &candidate : valid_candidates) {
        int left_child_nidx = tree[candidate.nid].LeftChild();
        int right_child_nidx = tree[candidate.nid].RightChild();
@@ -228,12 +247,15 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
    expand_set = driver.Pop();
  }

+  auto &h_out_position = p_out_position->HostVector();
+  this->LeafPartition(tree, gpair_h, &h_out_position);
  monitor_->Stop(__func__);
 }

 template <typename GradientSumT>
-void QuantileHistMaker::Builder<GradientSumT>::UpdateTree(HostDeviceVector<GradientPair> *gpair,
-                                                          DMatrix *p_fmat, RegTree *p_tree) {
+void QuantileHistMaker::Builder<GradientSumT>::UpdateTree(
+    HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat, RegTree *p_tree,
+    HostDeviceVector<bst_node_t> *p_out_position) {
  monitor_->Start(__func__);

  std::vector<GradientPair> *gpair_ptr = &(gpair->HostVector());
@@ -246,8 +268,7 @@ void QuantileHistMaker::Builder<GradientSumT>::UpdateTree(HostDeviceVector<Gradi

  this->InitData(p_fmat, *p_tree, gpair_ptr);

-  ExpandTree(p_fmat, p_tree, *gpair_ptr);
-
+  ExpandTree(p_fmat, p_tree, *gpair_ptr, p_out_position);
  monitor_->Stop(__func__);
 }

--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>

+#include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/json.h"

@@ -214,6 +215,15 @@ class HistRowPartitioner {
  size_t Size() const {
    return std::distance(row_set_collection_.begin(), row_set_collection_.end());
  }
+
+  // Delegates to partition_builder_; the predicate holds for rows whose hessian is zero.
+  void LeafPartition(Context const* ctx, RegTree const& tree,
+                     common::Span<GradientPair const> gpair,
+                     std::vector<bst_node_t>* p_out_position) const {
+    auto zero_hess = [&](size_t ridx) -> bool { return gpair[ridx].GetHess() - .0f == .0f; };
+    partition_builder_.LeafPartition(ctx, tree, this->Partitions(), p_out_position, zero_hess);
+  }
+
  auto& operator[](bst_node_t nidx) { return row_set_collection_[nidx]; }
  auto const& operator[](bst_node_t nidx) const { return row_set_collection_[nidx]; }
 };
@@ -228,8 +238,8 @@ class QuantileHistMaker: public TreeUpdater {
  explicit QuantileHistMaker(ObjInfo task) : task_{task} {}
  void Configure(const Args& args) override;

-  void Update(HostDeviceVector<GradientPair>* gpair,
-              DMatrix* dmat,
+  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override;

  bool UpdatePredictionCache(const DMatrix *data,
@@ -266,6 +276,8 @@ class QuantileHistMaker: public TreeUpdater {
    return "grow_quantile_histmaker";
  }

+  bool HasNodePosition() const override { return true; }  // Update() fills out_position (confirm)
+
 protected:
  CPUHistMakerTrainParam hist_maker_param_;
  // training parameter
@@ -289,7 +301,8 @@ class QuantileHistMaker: public TreeUpdater {
      monitor_->Init("Quantile::Builder");
    }
    // update one tree, growing
-    void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree);
+    void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
+                    HostDeviceVector<bst_node_t>* p_out_position);

    bool UpdatePredictionCache(DMatrix const* data, linalg::VectorView<float> out_preds) const;

@@ -308,7 +321,11 @@ class QuantileHistMaker: public TreeUpdater {
                        std::vector<CPUExpandEntry> const& valid_candidates,
                        std::vector<GradientPair> const& gpair);

-    void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector<GradientPair>& gpair_h);
+    void LeafPartition(RegTree const& tree, common::Span<GradientPair const> gpair,
+                       std::vector<bst_node_t>* p_out_position);
+
+    void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector<GradientPair>& gpair_h,
+                    HostDeviceVector<bst_node_t>* p_out_position);

   private:
    const size_t n_trees_;
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -42,9 +42,9 @@ class TreeRefresher: public TreeUpdater {
    return true;
  }
  // update the tree, do pruning
-  void Update(HostDeviceVector<GradientPair> *gpair,
-              DMatrix *p_fmat,
-              const std::vector<RegTree*> &trees) override {
+  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree *> &trees) override {
    if (trees.size() == 0) return;
    const std::vector<GradientPair> &gpair_h = gpair->ConstHostVector();
    // thread temporal space
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@@ -31,9 +31,9 @@ class TreeSyncher: public TreeUpdater {
    return "prune";
  }

-  void Update(HostDeviceVector<GradientPair>* ,
-              DMatrix*,
-              const std::vector<RegTree*> &trees) override {
+  void Update(HostDeviceVector<GradientPair>*, DMatrix*,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree*>& trees) override {
    if (rabit::GetWorldSize() == 1) return;
    std::string s_model;
    common::MemoryBufferStream fs(&s_model);