Use matrix for gradient. (#9508)

- Use the `linalg::Matrix` for storing gradients.
- New API for the custom objective.
- Custom objective for multi-class/multi-target is now required to return the correct shape.
- Custom objectives for Python can accept arrays with any strides (row-major or column-major).
This commit is contained in:
Jiaming Yuan
2023-08-24 05:29:52 +08:00
committed by GitHub
parent 6103dca0bb
commit 972730cde0
77 changed files with 1052 additions and 651 deletions

View File

@@ -29,7 +29,6 @@
#include "../common/error_msg.h"
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gblinear);
// training parameters
@@ -142,7 +141,7 @@ class GBLinear : public GradientBooster {
this->updater_->SaveConfig(&j_updater);
}
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, PredictionCacheEntry*,
void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry*,
ObjFunction const*) override {
monitor_.Start("DoBoost");
@@ -232,9 +231,8 @@ class GBLinear : public GradientBooster {
std::fill(contribs.begin(), contribs.end(), 0);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, format);
}
@@ -263,7 +261,7 @@ class GBLinear : public GradientBooster {
}
}
bool UseGPU() const override {
[[nodiscard]] bool UseGPU() const override {
if (param_.updater == "gpu_coord_descent") {
return true;
} else {

View File

@@ -167,8 +167,8 @@ void GBTree::Configure(Args const& cfg) {
}
}
void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_group_t,
HostDeviceVector<GradientPair>*)
void GPUCopyGradient(Context const*, linalg::Matrix<GradientPair> const*, bst_group_t,
linalg::Matrix<GradientPair>*)
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else
@@ -177,16 +177,19 @@ void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_gro
}
#endif
void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_threads,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair>* out_gpair) {
if (in_gpair->DeviceIdx() != Context::kCpuId) {
GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
void CopyGradient(Context const* ctx, linalg::Matrix<GradientPair> const* in_gpair,
bst_group_t group_id, linalg::Matrix<GradientPair>* out_gpair) {
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(in_gpair->Shape(0), 1);
if (ctx->IsCUDA()) {
GPUCopyGradient(ctx, in_gpair, group_id, out_gpair);
} else {
std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
const auto& gpair_h = in_gpair->ConstHostVector();
common::ParallelFor(out_gpair->Size(), n_threads,
[&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
auto const& in = *in_gpair;
auto target_gpair = in.Slice(linalg::All(), group_id);
auto h_tmp = out_gpair->HostView();
auto h_in = in.HostView().Slice(linalg::All(), group_id);
CHECK_EQ(h_tmp.Size(), h_in.Size());
common::ParallelFor(h_in.Size(), ctx->Threads(), [&](auto i) { h_tmp(i) = h_in(i); });
}
}
@@ -215,7 +218,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
}
}
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) {
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
@@ -263,12 +266,12 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
}
} else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
HostDeviceVector<GradientPair> tmp(in_gpair->Size() / n_groups, GradientPair(),
in_gpair->DeviceIdx());
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()};
bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear();
CopyGradient(in_gpair, ctx_->Threads(), n_groups, gid, &tmp);
CopyGradient(ctx_, in_gpair, gid, &tmp);
TreesOneGroup ret;
BoostNewTrees(&tmp, p_fmat, gid, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, node_position, &ret);
@@ -289,7 +292,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
this->CommitModel(std::move(new_trees));
}
void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
void GBTree::BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
std::vector<HostDeviceVector<bst_node_t>>* out_position,
TreesOneGroup* ret) {
std::vector<RegTree*> new_trees;

View File

@@ -1,22 +1,24 @@
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include "../common/device_helpers.cuh"
#include "xgboost/linalg.h"
#include "xgboost/span.h"
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh" // for MakeTransformIterator
#include "xgboost/base.h" // for GradientPair
#include "xgboost/linalg.h" // for Matrix
namespace xgboost::gbm {
void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair> *out_gpair) {
auto mat = linalg::TensorView<GradientPair const, 2>(
in_gpair->ConstDeviceSpan(),
{in_gpair->Size() / n_groups, static_cast<size_t>(n_groups)},
in_gpair->DeviceIdx());
auto v_in = mat.Slice(linalg::All(), group_id);
out_gpair->Resize(v_in.Size());
auto d_out = out_gpair->DeviceSpan();
dh::LaunchN(v_in.Size(), [=] __device__(size_t i) { d_out[i] = v_in(i); });
// Extract the gradient column for one output group from the multi-target gradient
// matrix and copy it into `out_gpair` on the device.
//
// `in_gpair`  : full gradient matrix, one column per group/target.
// `group_id`  : which column to extract.
// `out_gpair` : destination; reshaped here to an (n_rows, 1) column matrix on
//               `ctx`'s device, so callers need not pre-size it.
void GPUCopyGradient(Context const *ctx, linalg::Matrix<GradientPair> const *in_gpair,
bst_group_t group_id, linalg::Matrix<GradientPair> *out_gpair) {
// Non-owning device view of column `group_id`; Slice keeps strides, so v_in
// indexes the original (possibly non-contiguous) storage.
auto v_in = in_gpair->View(ctx->Device()).Slice(linalg::All(), group_id);
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(v_in.Size(), 1);
auto d_out = out_gpair->View(ctx->Device());
auto cuctx = ctx->CUDACtx();
// Transform iterator gathers v_in(i) element-by-element, hiding the column
// stride from thrust::copy, which requires contiguous-looking input.
auto it = dh::MakeTransformIterator<GradientPair>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v_in(i); });
// NOTE(review): CTP() is presumably a thrust execution policy bound to the
// context's CUDA stream — confirm; the copy is then stream-ordered, not synchronous.
thrust::copy(cuctx->CTP(), it, it + v_in.Size(), d_out.Values().data());
}
void GPUDartPredictInc(common::Span<float> out_predts,

View File

@@ -183,8 +183,8 @@ class GBTree : public GradientBooster {
/**
* @brief Carry out one iteration of boosting.
*/
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) override;
void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry* predt,
ObjFunction const* obj) override;
[[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
@@ -326,7 +326,7 @@ class GBTree : public GradientBooster {
}
protected:
void BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
void BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
std::vector<HostDeviceVector<bst_node_t>>* out_position,
std::vector<std::unique_ptr<RegTree>>* ret);