Use matrix for gradient. (#9508)

- Use the `linalg::Matrix` for storing gradients.
- New API for the custom objective.
- Custom objective for multi-class/multi-target is now required to return the correct shape.
- Custom objective for Python can accept arrays with any strides (row-major or column-major).
This commit is contained in:
Jiaming Yuan
2023-08-24 05:29:52 +08:00
committed by GitHub
parent 6103dca0bb
commit 972730cde0
77 changed files with 1052 additions and 651 deletions

View File

@@ -22,6 +22,7 @@
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
#include "../common/linalg_op.h" // for ElementWiseTransformHost
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
#include "../data/ellpack_page.h" // for EllpackPage
@@ -68,6 +69,7 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
}
}
static_assert(DMLC_CXX11_THREAD_LOCAL, "XGBoost depends on thread-local storage.");
using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
#if !defined(XGBOOST_USE_CUDA)
@@ -717,8 +719,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle,
xgboost::bst_ulong *out) {
XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, xgboost::bst_ulong *out) {
API_BEGIN();
CHECK_HANDLE();
auto p_m = CastDMatrixHandle(handle);
@@ -727,8 +728,7 @@ XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixNumCol(const DMatrixHandle handle,
xgboost::bst_ulong *out) {
XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, xgboost::bst_ulong *out) {
API_BEGIN();
CHECK_HANDLE();
auto p_m = CastDMatrixHandle(handle);
@@ -970,28 +970,71 @@ XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle,
API_END();
}
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle,
DMatrixHandle dtrain,
bst_float *grad,
bst_float *hess,
xgboost::bst_ulong len) {
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bst_float *grad,
bst_float *hess, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
HostDeviceVector<GradientPair> tmp_gpair;
auto* bst = static_cast<Learner*>(handle);
auto* dtr =
static_cast<std::shared_ptr<DMatrix>*>(dtrain);
tmp_gpair.Resize(len);
std::vector<GradientPair>& tmp_gpair_h = tmp_gpair.HostVector();
if (len > 0) {
xgboost_CHECK_C_ARG_PTR(grad);
xgboost_CHECK_C_ARG_PTR(hess);
}
for (xgboost::bst_ulong i = 0; i < len; ++i) {
tmp_gpair_h[i] = GradientPair(grad[i], hess[i]);
}
error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx()->MakeCPU();
bst->BoostOneIter(0, *dtr, &tmp_gpair);
auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len);
auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len);
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
return XGBoosterTrainOneIter(handle, dtrain, 0, s_grad.c_str(), s_hess.c_str());
API_END();
}
namespace xgboost {
// Copy user-supplied CUDA gradient/hessian arrays into `out_gpair`.
// In builds without CUDA support this fallback body aborts via
// AssertGPUSupport(); otherwise only a declaration is emitted here and the
// real implementation is provided in the CUDA translation unit.
void CopyGradientFromCUDAArrays(Context const *, ArrayInterface<2, false> const &,
ArrayInterface<2, false> const &, linalg::Matrix<GradientPair> *)
#if !defined(XGBOOST_USE_CUDA)
{
common::AssertGPUSupport();
}
#else
; // NOLINT
#endif
}  // namespace xgboost
/**
 * @brief C API: run one boosting iteration with custom (user-supplied) gradients.
 *
 * `grad` and `hess` are JSON array-interface strings describing 2-dim
 * (n_samples, n_targets) buffers, which may live on either host or CUDA
 * memory (detected from the data pointer).  The two buffers are fused into a
 * single linalg::Matrix<GradientPair> and passed to Learner::BoostOneIter.
 *
 * Fails (via CHECK) when grad/hess shapes differ, when the row count does not
 * match the DMatrix, or when the two buffers live on different devices.
 */
XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, int iter,
char const *grad, char const *hess) {
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(grad);
xgboost_CHECK_C_ARG_PTR(hess);
auto p_fmat = CastDMatrixHandle(dtrain);
// Parse the array-interface JSON; 2-dim, no masked values.
ArrayInterface<2, false> i_grad{StringView{grad}};
ArrayInterface<2, false> i_hess{StringView{hess}};
StringView msg{"Mismatched shape between the gradient and hessian."};
CHECK_EQ(i_grad.Shape(0), i_hess.Shape(0)) << msg;
CHECK_EQ(i_grad.Shape(1), i_hess.Shape(1)) << msg;
linalg::Matrix<GradientPair> gpair;
// Device placement is inferred from the raw data pointers.
auto grad_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_grad.data);
auto hess_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_hess.data);
CHECK_EQ(i_grad.Shape(0), p_fmat->Info().num_row_)
<< "Mismatched size between the gradient and training data.";
CHECK_EQ(grad_is_cuda, hess_is_cuda) << "gradient and hessian should be on the same device.";
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx();
if (!grad_is_cuda) {
// Host path: dispatch on the input dtypes and fuse grad/hess into
// GradientPair elements in parallel on the CPU.
gpair.Reshape(i_grad.Shape(0), i_grad.Shape(1));
auto const shape = gpair.Shape();
auto h_gpair = gpair.HostView();
DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) {
DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) {
common::ParallelFor(h_gpair.Size(), ctx->Threads(),
detail::CustomGradHessOp{t_grad, t_hess, h_gpair});
});
});
} else {
// Device path: implemented in the CUDA translation unit.
CopyGradientFromCUDAArrays(ctx, i_grad, i_hess, &gpair);
}
learner->BoostOneIter(iter, p_fmat, &gpair);
API_END();
}

View File

@@ -1,8 +1,12 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
*/
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include <thrust/transform.h> // for transform
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/threading_utils.h"
#include "../data/array_interface.h" // for DispatchDType, ArrayInterface
#include "../data/device_adapter.cuh"
#include "../data/proxy_dmatrix.h"
#include "c_api_error.h"
@@ -13,7 +17,6 @@
#include "xgboost/learner.h"
namespace xgboost {
void XGBBuildInfoDevice(Json *p_info) {
auto &info = *p_info;
@@ -55,6 +58,27 @@ void XGBoostAPIGuard::RestoreGPUAttribute() {
// If errors, do nothing, assuming running on CPU only machine.
cudaSetDevice(device_id_);
}
// CUDA implementation: fuse device-resident gradient/hessian arrays into a
// single device matrix of GradientPair.  Both inputs must reside on the same
// CUDA device; the output matrix is placed on that device and reshaped to the
// gradient's (n_samples, n_targets) shape.
void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,
ArrayInterface<2, false> const &hess,
linalg::Matrix<GradientPair> *out_gpair) {
auto grad_dev = dh::CudaGetPointerDevice(grad.data);
auto hess_dev = dh::CudaGetPointerDevice(hess.data);
CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
auto &gpair = *out_gpair;
gpair.SetDevice(grad_dev);
gpair.Reshape(grad.Shape(0), grad.Shape(1));
auto d_gpair = gpair.View(grad_dev);
auto cuctx = ctx->CUDACtx();
// Dispatch on the input dtypes, then launch one thread per element to
// convert struct-of-arrays (grad, hess) into array-of-structs (GradientPair).
DispatchDType(grad, DeviceOrd::CUDA(grad_dev), [&](auto &&t_grad) {
DispatchDType(hess, DeviceOrd::CUDA(hess_dev), [&](auto &&t_hess) {
CHECK_EQ(t_grad.Size(), t_hess.Size());
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), t_grad.Size(),
detail::CustomGradHessOp{t_grad, t_hess, d_gpair});
});
});
}
} // namespace xgboost
using namespace xgboost; // NOLINT

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2022 by Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file c_api_error.h
* \brief Error handling for C API.
*/
@@ -35,8 +35,8 @@
} \
return 0; // NOLINT(*)
#define CHECK_HANDLE() if (handle == nullptr) \
LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed.";
#define CHECK_HANDLE() \
if (handle == nullptr) ::xgboost::detail::EmptyHandle();
/*!
* \brief Set the last error message needed by C API

View File

@@ -7,8 +7,10 @@
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // std::shared_ptr
#include <string>
#include <memory> // for shared_ptr
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector>
#include "xgboost/c_api.h"
@@ -16,7 +18,7 @@
#include "xgboost/feature_map.h" // for FeatureMap
#include "xgboost/json.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h" // ArrayInterfaceHandler
#include "xgboost/linalg.h" // ArrayInterfaceHandler, MakeTensorView, ArrayInterfaceStr
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // StringView
@@ -287,6 +289,19 @@ inline std::shared_ptr<DMatrix> CastDMatrixHandle(DMatrixHandle const handle) {
}
namespace detail {
// Raise a fatal error for a null DMatrix/Booster handle; shared by
// CHECK_HANDLE() and the handle helpers below so the message stays uniform.
inline void EmptyHandle() {
LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed.";
}
// Fetch the Context of the Learner behind a booster handle.
// Fatals via EmptyHandle() when the handle is null; never returns nullptr.
inline xgboost::Context const *BoosterCtx(BoosterHandle handle) {
if (handle == nullptr) {
EmptyHandle();
}
auto *learner = static_cast<xgboost::Learner *>(handle);
CHECK(learner);
return learner->Ctx();
}
template <typename PtrT, typename I, typename T>
void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data,
std::size_t nindptr, std::string *indptr_str, std::string *indices_str,
@@ -334,6 +349,40 @@ void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data
Json::Dump(jindices, indices_str);
Json::Dump(jdata, data_str);
}
/**
 * @brief Make array interface for other language bindings.
 *
 * Wraps the raw `grad` and `hess` buffers as (n_samples, n_targets) tensor
 * views and serializes each to its array-interface JSON string.
 *
 * @param ctx        Context used to build the tensor views.
 * @param grad       Pointer to n_samples * n_targets gradient values.
 * @param hess       Pointer to n_samples * n_targets hessian values.
 * @return A tuple (grad_str, hess_str) of array-interface strings.
 */
template <typename G, typename H>
auto MakeGradientInterface(Context const *ctx, G const *grad, H const *hess, std::size_t n_samples,
std::size_t n_targets) {
auto t_grad =
linalg::MakeTensorView(ctx, common::Span{grad, n_samples * n_targets}, n_samples, n_targets);
auto t_hess =
linalg::MakeTensorView(ctx, common::Span{hess, n_samples * n_targets}, n_samples, n_targets);
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
return std::make_tuple(s_grad, s_hess);
}
// Element-wise functor fusing separate gradient (G) and hessian (H) matrices
// into one matrix of GradientPair.  Marked XGBOOST_DEVICE so the same functor
// is usable from common::ParallelFor on the host and thrust::for_each_n on
// the device.
template <typename G, typename H>
struct CustomGradHessOp {
linalg::MatrixView<G> t_grad;
linalg::MatrixView<H> t_hess;
linalg::MatrixView<GradientPair> d_gpair;
CustomGradHessOp(linalg::MatrixView<G> t_grad, linalg::MatrixView<H> t_hess,
linalg::MatrixView<GradientPair> d_gpair)
: t_grad{std::move(t_grad)}, t_hess{std::move(t_hess)}, d_gpair{std::move(d_gpair)} {}
// i is a flat element index; unravel it into (row, target) coordinates.
XGBOOST_DEVICE void operator()(std::size_t i) {
auto [m, n] = linalg::UnravelIndex(i, t_grad.Shape(0), t_grad.Shape(1));
auto g = t_grad(m, n);
auto h = t_hess(m, n);
// from struct of arrays to array of structs.
d_gpair(m, n) = GradientPair{static_cast<float>(g), static_cast<float>(h)};
}
};
} // namespace detail
} // namespace xgboost
#endif // XGBOOST_C_API_C_API_UTILS_H_