Merge branch 'master' into dev-hui

This commit is contained in:
amdsc21
2023-03-08 00:39:33 +01:00
221 changed files with 3122 additions and 1486 deletions

View File

@@ -3,27 +3,34 @@
*/
#include "adaptive.h"
#include <limits>
#include <vector>
#include <algorithm> // std::transform,std::find_if,std::copy,std::unique
#include <cmath> // std::isnan
#include <cstddef> // std::size_t
#include <iterator> // std::distance
#include <vector> // std::vector
#include "../common/common.h"
#include "../common/numeric.h"
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/algorithm.h" // ArgSort
#include "../common/common.h" // AssertGPUSupport
#include "../common/numeric.h" // RunLengthEncode
#include "../common/stats.h" // Quantile,WeightedQuantile
#include "../common/threading_utils.h" // ParallelFor
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/linalg.h"
#include "xgboost/tree_model.h"
#include "xgboost/base.h" // bst_node_t
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // MakeTensorView
#include "xgboost/span.h" // Span
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
namespace detail {
void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& position,
std::vector<size_t>* p_nptr, std::vector<bst_node_t>* p_nidx,
std::vector<size_t>* p_ridx) {
namespace xgboost::obj::detail {
void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree,
std::vector<bst_node_t> const& position, std::vector<size_t>* p_nptr,
std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_ridx) {
auto& nptr = *p_nptr;
auto& nidx = *p_nidx;
auto& ridx = *p_ridx;
ridx = common::ArgSort<size_t>(position);
ridx = common::ArgSort<size_t>(ctx, position.cbegin(), position.cend());
std::vector<bst_node_t> sorted_pos(position);
// permutation
for (size_t i = 0; i < position.size(); ++i) {
@@ -67,18 +74,18 @@ void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& posi
}
void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
auto& tree = *p_tree;
std::vector<bst_node_t> nidx;
std::vector<size_t> nptr;
std::vector<size_t> ridx;
EncodeTreeLeafHost(*p_tree, position, &nptr, &nidx, &ridx);
EncodeTreeLeafHost(ctx, *p_tree, position, &nptr, &nidx, &ridx);
size_t n_leaf = nidx.size();
if (nptr.empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx, p_tree);
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
return;
}
@@ -89,8 +96,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
auto const& h_node_idx = nidx;
auto const& h_node_ptr = nptr;
CHECK_LE(h_node_ptr.back(), info.num_row_);
auto h_predt = linalg::MakeTensorView(predt.ConstHostSpan(),
{info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
predt.Size() / info.num_row_);
// loop over each leaf
common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
@@ -99,8 +106,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
CHECK_LT(k + 1, h_node_ptr.size());
size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
CHECK_LE(group_idx, info.labels.Shape(1));
auto h_labels = info.labels.HostView().Slice(linalg::All(), group_idx);
auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
auto h_weights = linalg::MakeVec(&info.weights_);
auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
@@ -114,9 +121,9 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
float q{0};
if (info.weights_.Empty()) {
q = common::Quantile(alpha, iter, iter + h_row_set.size());
q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
} else {
q = common::WeightedQuantile(alpha, iter, iter + h_row_set.size(), w_it);
q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
}
if (std::isnan(q)) {
CHECK(h_row_set.empty());
@@ -124,8 +131,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
quantiles.at(k) = q;
});
UpdateLeafValues(&quantiles, nidx, p_tree);
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
}
} // namespace detail
} // namespace obj
} // namespace xgboost
#if !defined(XGBOOST_USE_CUDA)
// CPU-only build stub: reaching this function means a device (GPU) leaf update
// was requested from a binary compiled without CUDA, so fail loudly instead of
// silently doing nothing.
void UpdateTreeLeafDevice(Context const*, common::Span<bst_node_t const>, std::int32_t,
                          MetaInfo const&, float, HostDeviceVector<float> const&, float, RegTree*) {
  common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::obj::detail

View File

@@ -3,8 +3,8 @@
*/
#include <thrust/sort.h>
#include <cstdint> // std::int32_t
#include <cub/cub.cuh>
#include <cstdint> // std::int32_t
#include <cub/cub.cuh> // NOLINT
#include "../common/cuda_context.cuh" // CUDAContext
#include "../common/device_helpers.cuh"
@@ -20,20 +20,19 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::XGBDeviceAllocator<char> alloc;
dh::device_vector<bst_node_t> sorted_position(position.size());
dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), cudaMemcpyDeviceToDevice));
position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
p_ridx->resize(position.size());
dh::Iota(dh::ToSpan(*p_ridx));
// sort row index according to node index
thrust::stable_sort_by_key(thrust::cuda::par(alloc), sorted_position.begin(),
thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
sorted_position.begin() + n_samples, p_ridx->begin());
dh::XGBCachingDeviceAllocator<char> caching;
size_t beg_pos =
thrust::find_if(thrust::cuda::par(caching), sorted_position.cbegin(), sorted_position.cend(),
thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(),
[] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) -
sorted_position.cbegin();
if (beg_pos == sorted_position.size()) {
@@ -72,7 +71,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
size_t* h_num_runs = reinterpret_cast<size_t*>(pinned.subspan(0, sizeof(size_t)).data());
dh::CUDAEvent e;
e.Record(dh::DefaultStream());
e.Record(cuctx->Stream());
copy_stream.View().Wait(e);
// flag for whether there's ignored position
bst_node_t* h_first_unique =
@@ -108,7 +107,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
d_node_ptr[0] = beg_pos;
}
});
thrust::inclusive_scan(thrust::cuda::par(caching), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
thrust::inclusive_scan(cuctx->CTP(), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
dh::tbegin(d_node_ptr));
copy_stream.View().Sync();
CHECK_GT(*h_num_runs, 0);
@@ -141,7 +140,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
}
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::device_vector<size_t> ridx;
@@ -152,17 +151,17 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
if (nptr.Empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree);
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), learning_rate, p_tree);
}
HostDeviceVector<float> quantiles;
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(predt.ConstDeviceSpan(),
{info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_,
predt.Size() / info.num_row_);
CHECK_LT(group_idx, d_predt.Shape(1));
auto t_predt = d_predt.Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
@@ -187,7 +186,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
w_it + d_weights.size(), &quantiles);
}
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree);
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), learning_rate, p_tree);
}
} // namespace detail
} // namespace obj

View File

@@ -6,13 +6,15 @@
#include <algorithm>
#include <cstdint> // std::int32_t
#include <limits>
#include <vector>
#include <vector> // std::vector
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
#include "xgboost/base.h" // bst_node_t
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
@@ -34,7 +36,7 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
}
inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
RegTree* p_tree) {
float learning_rate, RegTree* p_tree) {
auto& tree = *p_tree;
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
@@ -69,17 +71,39 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
auto nidx = h_node_idx[i];
auto q = quantiles[i];
CHECK(tree[nidx].IsLeaf());
tree[nidx].SetLeaf(q);
tree[nidx].SetLeaf(q * learning_rate);
}
}
// Map a gradient-group index to a column of the label matrix.  With a
// single-column label every group reads column 0; with multi-column labels
// (Shape(1) > 1) the group index selects the matching label column.
inline std::size_t IdxY(MetaInfo const& info, bst_group_t group_idx) {
  std::size_t y_idx{0};
  if (info.labels.Shape(1) > 1) {
    y_idx = group_idx;
  }
  // NOTE(review): CHECK_LE allows y_idx == Shape(1); CHECK_LT looks like the
  // intended bound for an index — confirm whether Shape(1) can legally be 0
  // here before tightening.
  CHECK_LE(y_idx, info.labels.Shape(1));
  return y_idx;
}
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
} // namespace detail
// Dispatch the adaptive (quantile-based) leaf refit to the CPU or device
// implementation depending on the context.  `learning_rate` scales the fitted
// quantile before it is written into the leaf (see UpdateLeafValues).
inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
                           std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                           HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
  if (ctx->IsCPU()) {
    detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
                               predt, alpha, p_tree);
  } else {
    // Ensure the per-row node positions live on the context's device before
    // handing a device span to the CUDA implementation.
    position.SetDevice(ctx->gpu_id);
    detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
                                 predt, alpha, p_tree);
  }
}
} // namespace obj
} // namespace xgboost

View File

@@ -0,0 +1,44 @@
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#include "init_estimation.h"
#include <memory> // unique_ptr
#include "../common/stats.h" // Mean
#include "../tree/fit_stump.h" // FitStump
#include "xgboost/base.h" // GradientPair
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/json.h" // Json
#include "xgboost/linalg.h" // Tensor,Vector
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
namespace obj {
// Estimate the global intercept (base_score) by fitting a depth-0 stump:
// evaluate the objective's gradient at an all-zero prediction, fit a single
// leaf weight per target, average the targets into one scalar, and run it
// through PredTransform so the score lives on the objective's output scale.
void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const {
  if (this->Task().task == ObjInfo::kRegression) {
    CheckInitInputs(info);
  }
  // Avoid altering any state in child objective: clone `this` via a
  // SaveConfig -> Create -> LoadConfig round-trip and call GetGradient on the
  // clone instead.
  HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
  HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);

  Json config{Object{}};
  this->SaveConfig(&config);

  std::unique_ptr<ObjFunction> new_obj{
      ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
  new_obj->LoadConfig(config);
  new_obj->GetGradient(dummy_predt, info, 0, &gpair);
  bst_target_t n_targets = this->Targets(info);
  linalg::Vector<float> leaf_weight;
  tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);

  // workaround, we don't support multi-target due to binary model serialization for
  // base margin.
  common::Mean(this->ctx_, leaf_weight, base_score);
  this->PredTransform(base_score->Data());
}
} // namespace obj
} // namespace xgboost

View File

@@ -0,0 +1,25 @@
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
#define XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
#include "xgboost/data.h" // MetaInfo
#include "xgboost/linalg.h" // Tensor
#include "xgboost/objective.h" // ObjFunction
namespace xgboost {
namespace obj {
// Objective mix-in providing a generic intercept (base_score) estimate by
// stump-fitting on the objective's own gradient; objectives without a
// closed-form initial estimate inherit from this instead of ObjFunction.
class FitIntercept : public ObjFunction {
  void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override;
};
// Validate label and weight shapes before estimating the initial base score:
// one label row per data row, and — when weights are supplied — one weight
// per data row.
inline void CheckInitInputs(MetaInfo const& info) {
  CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
  if (!info.weights_.Empty()) {
    CHECK_EQ(info.weights_.Size(), info.num_row_)
        << "Number of weights should be equal to number of data points.";
  }
}
} // namespace obj
} // namespace xgboost
#endif // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_

View File

@@ -44,11 +44,13 @@ namespace obj {
// List of files that will be force linked in static links.
#ifdef XGBOOST_USE_CUDA
DMLC_REGISTRY_LINK_TAG(regression_obj_gpu);
DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
#else
DMLC_REGISTRY_LINK_TAG(regression_obj);
DMLC_REGISTRY_LINK_TAG(quantile_obj);
DMLC_REGISTRY_LINK_TAG(hinge_obj);
DMLC_REGISTRY_LINK_TAG(multiclass_obj);
DMLC_REGISTRY_LINK_TAG(rank_obj);

View File

@@ -0,0 +1,18 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
// Dummy file to enable the CUDA conditional compile trick.
#include <dmlc/registry.h>
namespace xgboost {
namespace obj {
DMLC_REGISTRY_FILE_TAG(quantile_obj);
} // namespace obj
} // namespace xgboost
#ifndef XGBOOST_USE_CUDA
#include "quantile_obj.cu"
#endif // !defined(XGBOOST_USE_CUDA)

View File

@@ -0,0 +1,222 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <vector> // std::vector
#include "../common/linalg_op.h" // ElementWiseKernel,cbegin,cend
#include "../common/quantile_loss_utils.h" // QuantileLossParam
#include "../common/stats.h" // Quantile,WeightedQuantile
#include "adaptive.h" // UpdateTreeLeaf
#include "dmlc/parameter.h" // DMLC_DECLARE_PARAMETER
#include "init_estimation.h" // CheckInitInputs
#include "xgboost/base.h" // GradientPair,XGBOOST_DEVICE,bst_target_t
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/json.h" // Json,String,ToJson,FromJson
#include "xgboost/linalg.h" // Tensor,MakeTensorView,MakeVec
#include "xgboost/objective.h" // ObjFunction
#include "xgboost/parameter.h" // XGBoostParameter
#if defined(XGBOOST_USE_CUDA)
#include "../common/linalg_op.cuh" // ElementWiseKernel
#include "../common/stats.cuh" // SegmentedQuantile
#endif // defined(XGBOOST_USE_CUDA)
namespace xgboost {
namespace obj {
// Objective implementing quantile (pinball-loss) regression.  Several quantiles
// can be fit at once: each configured alpha becomes one model output ("target"),
// and tree leaves are later refit to the empirical quantile via UpdateTreeLeaf.
class QuantileRegression : public ObjFunction {
  common::QuantileLossParam param_;  // configured `quantile_alpha` list
  HostDeviceVector<float> alpha_;    // host/device cache of the alphas

  // Number of model outputs: one per alpha (times label columns, currently
  // restricted to exactly one).
  bst_target_t Targets(MetaInfo const& info) const override {
    auto const& alpha = param_.quantile_alpha.Get();
    CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
    CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target is not yet supported by the quantile loss.";
    CHECK(!alpha.empty());
    // We have some placeholders for multi-target in the quantile loss. But it's not
    // supported as the gbtree doesn't know how to slice the gradient and there's no 3-dim
    // model shape in general.
    auto n_y = std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
    return alpha_.Size() * n_y;
  }

 public:
  // Pinball-loss (sub)gradient for every (row, quantile) pair: (1 - alpha) * w
  // when the prediction is above the label, -alpha * w otherwise.  The sample
  // weight doubles as the hessian since the loss is piecewise linear.
  void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info, std::int32_t iter,
                   HostDeviceVector<GradientPair>* out_gpair) override {
    if (iter == 0) {
      CheckInitInputs(info);
    }
    CHECK_EQ(param_.quantile_alpha.Get().size(), alpha_.Size());

    using SizeT = decltype(info.num_row_);
    SizeT n_targets = this->Targets(info);
    SizeT n_alphas = alpha_.Size();
    CHECK_NE(n_alphas, 0);
    CHECK_GE(n_targets, n_alphas);
    CHECK_EQ(preds.Size(), info.num_row_ * n_targets);

    auto labels = info.labels.View(ctx_->gpu_id);

    out_gpair->SetDevice(ctx_->gpu_id);
    out_gpair->Resize(n_targets * info.num_row_);
    // Gradient viewed as (row, quantile, target-within-quantile).
    auto gpair =
        linalg::MakeTensorView(ctx_, out_gpair, info.num_row_, n_alphas, n_targets / n_alphas);

    info.weights_.SetDevice(ctx_->gpu_id);
    // OptionalWeights yields 1.0 per sample when no weights were provided.
    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
                                                 : info.weights_.ConstDeviceSpan()};

    preds.SetDevice(ctx_->gpu_id);
    auto predt = linalg::MakeVec(&preds);
    auto n_samples = info.num_row_;

    alpha_.SetDevice(ctx_->gpu_id);
    auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();

    linalg::ElementWiseKernel(
        ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
          auto [sample_id, quantile_id, target_id] =
              linalg::UnravelIndex(i, n_samples, alpha.size(), n_targets / alpha.size());
          // Residual: prediction minus label for this sample/target.
          auto d = predt(i) - labels(sample_id, target_id);
          auto h = weight[sample_id];
          if (d >= 0) {
            // Over-prediction.
            auto g = (1.0f - alpha[quantile_id]) * weight[sample_id];
            gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
          } else {
            // Under-prediction.
            auto g = (-alpha[quantile_id] * weight[sample_id]);
            gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
          }
        });
  }

  // Initial base_score: the (weighted) label quantile for every alpha,
  // collapsed to a single scalar (see workaround note below) and combined
  // across distributed workers as a weighted mean via allreduce.
  void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override {
    CHECK(!alpha_.Empty());

    auto n_targets = this->Targets(info);
    base_score->SetDevice(ctx_->gpu_id);
    base_score->Reshape(n_targets);

    double sw{0};  // local sum of sample weights (row count when unweighted)
    if (ctx_->IsCPU()) {
      auto quantiles = base_score->HostView();
      auto h_weights = info.weights_.ConstHostVector();
      if (info.weights_.Empty()) {
        sw = info.num_row_;
      } else {
        sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
      }
      // One (weighted) quantile of the labels per configured alpha.
      for (bst_target_t t{0}; t < n_targets; ++t) {
        auto alpha = param_.quantile_alpha[t];
        auto h_labels = info.labels.HostView();
        if (h_weights.empty()) {
          quantiles(t) =
              common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
        } else {
          CHECK_EQ(h_weights.size(), h_labels.Size());
          quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
                                                  linalg::cend(h_labels), std::cbegin(h_weights));
        }
      }
    } else {
#if defined(XGBOOST_USE_CUDA)
      // Device path: replicate the label column once per alpha and compute all
      // quantiles in a single segmented pass (one segment per alpha).
      alpha_.SetDevice(ctx_->gpu_id);
      auto d_alpha = alpha_.ConstDeviceSpan();
      auto d_labels = info.labels.View(ctx_->gpu_id);
      auto seg_it = dh::MakeTransformIterator<std::size_t>(
          thrust::make_counting_iterator(0ul),
          [=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
      CHECK_EQ(d_labels.Shape(1), 1);
      auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                     [=] XGBOOST_DEVICE(std::size_t i) {
                                                       auto sample_idx = i % d_labels.Shape(0);
                                                       return d_labels(sample_idx, 0);
                                                     });
      auto n = d_labels.Size() * d_alpha.size();
      CHECK_EQ(base_score->Size(), d_alpha.size());
      if (info.weights_.Empty()) {
        common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
                                  val_it + n, base_score->Data());
        sw = info.num_row_;
      } else {
        info.weights_.SetDevice(ctx_->gpu_id);
        auto d_weights = info.weights_.ConstDeviceSpan();
        // Weights are per-sample, so tile them alongside the labels.
        auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                          [=] XGBOOST_DEVICE(std::size_t i) {
                                                            auto sample_idx = i % d_labels.Shape(0);
                                                            return d_weights[sample_idx];
                                                          });
        common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
                                          val_it, val_it + n, weight_it, weight_it + n,
                                          base_score->Data());
        sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
                        thrust::plus<double>{});
      }
#else
      common::AssertGPUSupport();
#endif  // defined(XGBOOST_USE_CUDA)
    }

    // For multiple quantiles, we should extend the base score to a vector instead of
    // computing the average. For now, this is a workaround.
    linalg::Vector<float> temp;
    common::Mean(ctx_, *base_score, &temp);
    double meanq = temp(0) * sw;

    // Weighted mean across workers: sum(local_mean * local_weight) / sum(weight).
    collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
    collective::Allreduce<collective::Operation::kSum>(&sw, 1);
    meanq /= (sw + kRtEps);
    base_score->Reshape(1);
    base_score->Data()->Fill(meanq);
  }

  // Refit leaf values of the finished tree to the alpha-quantile of the
  // residuals, scaled by the learning rate (delegates to obj::UpdateTreeLeaf).
  void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
                      float learning_rate, HostDeviceVector<float> const& prediction,
                      std::int32_t group_idx, RegTree* p_tree) const override {
    auto alpha = param_.quantile_alpha[group_idx];
    ::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction,
                                   alpha, p_tree);
  }

  // Parse and validate parameters, then cache the alphas in a vector usable on
  // either host or device.
  void Configure(Args const& args) override {
    param_.UpdateAllowUnknown(args);
    param_.Validate();
    this->alpha_.HostVector() = param_.quantile_alpha.Get();
  }
  // NOTE(review): the two boolean flags correspond to ObjInfo's extra members —
  // confirm their exact meaning against xgboost/task.h.
  ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; }
  static char const* Name() { return "reg:quantileerror"; }

  void SaveConfig(Json* p_out) const override {
    auto& out = *p_out;
    out["name"] = String(Name());
    out["quantile_loss_param"] = ToJson(param_);
  }

  void LoadConfig(Json const& in) override {
    CHECK_EQ(get<String const>(in["name"]), Name());
    FromJson(in["quantile_loss_param"], &param_);
    alpha_.HostVector() = param_.quantile_alpha.Get();
  }

  const char* DefaultEvalMetric() const override { return "quantile"; }
  // The default metric needs the same alphas as the objective, so hand it the
  // configured quantile_loss_param.
  Json DefaultMetricConfig() const override {
    CHECK(param_.GetInitialised());
    Json config{Object{}};
    config["name"] = String{this->DefaultEvalMetric()};
    config["quantile_loss_param"] = ToJson(param_);
    return config;
  }
};
XGBOOST_REGISTER_OBJECTIVE(QuantileRegression, QuantileRegression::Name())
.describe("Regression with quantile loss.")
.set_body([]() { return new QuantileRegression(); });
#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(quantile_obj_gpu);
#endif // defined(XGBOOST_USE_CUDA)
} // namespace obj
} // namespace xgboost

View File

@@ -1,15 +1,16 @@
/*!
* Copyright 2017-2022 XGBoost contributors
/**
* Copyright 2017-2023 by XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
#include <dmlc/omp.h>
#include <xgboost/logging.h>
#include <cmath>
#include "../common/math.h"
#include "xgboost/data.h" // MetaInfo
#include "xgboost/logging.h"
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
@@ -105,7 +106,6 @@ struct LogisticRaw : public LogisticRegression {
static ObjInfo Info() { return ObjInfo::kRegression; }
};
} // namespace obj
} // namespace xgboost

View File

@@ -20,12 +20,12 @@
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/transform.h"
#include "../tree/fit_stump.h" // FitStump
#include "./regression_loss.h"
#include "adaptive.h"
#include "init_estimation.h" // FitIntercept
#include "xgboost/base.h"
#include "xgboost/context.h"
#include "xgboost/data.h" // MetaInfo
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
@@ -43,45 +43,12 @@
namespace xgboost {
namespace obj {
namespace {
void CheckInitInputs(MetaInfo const& info) {
CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
if (!info.weights_.Empty()) {
CHECK_EQ(info.weights_.Size(), info.num_row_)
<< "Number of weights should be equal to number of data points.";
}
}
void CheckRegInputs(MetaInfo const& info, HostDeviceVector<bst_float> const& preds) {
CheckInitInputs(info);
CHECK_EQ(info.labels.Size(), preds.Size()) << "Invalid shape of labels.";
}
} // anonymous namespace
class RegInitEstimation : public ObjFunction {
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) const override {
CheckInitInputs(info);
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);
std::unique_ptr<ObjFunction> new_obj{
ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
new_obj->LoadConfig(config);
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.
common::Mean(this->ctx_, leaf_weight, base_score);
this->PredTransform(base_score->Data());
}
};
#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(regression_obj_gpu);
#endif // defined(XGBOOST_USE_CUDA)
@@ -96,7 +63,7 @@ struct RegLossParam : public XGBoostParameter<RegLossParam> {
};
template<typename Loss>
class RegLossObj : public RegInitEstimation {
class RegLossObj : public FitIntercept {
protected:
HostDeviceVector<float> additional_input_;
@@ -243,7 +210,7 @@ XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
return new RegLossObj<LinearSquareLoss>(); });
// End deprecated
class PseudoHuberRegression : public RegInitEstimation {
class PseudoHuberRegression : public FitIntercept {
PesudoHuberParam param_;
public:
@@ -318,7 +285,7 @@ struct PoissonRegressionParam : public XGBoostParameter<PoissonRegressionParam>
};
// poisson regression for count
class PoissonRegression : public RegInitEstimation {
class PoissonRegression : public FitIntercept {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -413,7 +380,7 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
// cox regression for survival data (negative values mean they are censored)
class CoxRegression : public RegInitEstimation {
class CoxRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -426,7 +393,7 @@ class CoxRegression : public RegInitEstimation {
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
const std::vector<size_t> &label_order = info.LabelAbsSort();
const std::vector<size_t> &label_order = info.LabelAbsSort(ctx_);
const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size()); // NOLINT(*)
const bool is_null_weight = info.weights_.Size() == 0;
@@ -510,7 +477,7 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
.set_body([]() { return new CoxRegression(); });
// gamma regression
class GammaRegression : public RegInitEstimation {
class GammaRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -601,7 +568,7 @@ struct TweedieRegressionParam : public XGBoostParameter<TweedieRegressionParam>
};
// tweedie regression
class TweedieRegression : public RegInitEstimation {
class TweedieRegression : public FitIntercept {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -775,20 +742,10 @@ class MeanAbsoluteError : public ObjFunction {
}
void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
HostDeviceVector<float> const& prediction, std::int32_t group_idx,
RegTree* p_tree) const override {
if (ctx_->IsCPU()) {
auto const& h_position = position.ConstHostVector();
detail::UpdateTreeLeafHost(ctx_, h_position, group_idx, info, prediction, 0.5, p_tree);
} else {
#if defined(XGBOOST_USE_CUDA)
position.SetDevice(ctx_->gpu_id);
auto d_position = position.ConstDeviceSpan();
detail::UpdateTreeLeafDevice(ctx_, d_position, group_idx, info, prediction, 0.5, p_tree);
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
float learning_rate, HostDeviceVector<float> const& prediction,
std::int32_t group_idx, RegTree* p_tree) const override {
::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5,
p_tree);
}
const char* DefaultEvalMetric() const override { return "mae"; }