Implement fit stump. (#8607)

2023-01-04 04:14:51 +08:00
parent 20e6087579
commit 8d545ab2a2
23 changed files with 421 additions and 60 deletions
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ *
+ * \brief Utilities for estimating initial score.
+ */
+#include "fit_stump.h"
+
+#include <cinttypes>  // std::int32_t
+#include <cstddef>    // std::size_t
+
+#include "../collective/communicator-inl.h"
+#include "../common/common.h"              // AssertGPUSupport
+#include "../common/numeric.h"             // cpu_impl::Reduce
+#include "../common/threading_utils.h"     // ParallelFor
+#include "../common/transform_iterator.h"  // MakeIndexTransformIter
+#include "xgboost/base.h"                  // bst_target_t, GradientPairPrecise
+#include "xgboost/context.h"               // Context
+#include "xgboost/linalg.h"                // TensorView, Tensor, Constant
+#include "xgboost/logging.h"               // CHECK_EQ
+
+namespace xgboost {
+namespace tree {
+namespace cpu_impl {
+/**
+ * @brief CPU implementation: per-target sum of gradient pairs turned into a stump weight.
+ *
+ * @param ctx   Supplies the number of threads used for the parallel reduction.
+ * @param gpair Gradient pairs viewed as (samples, targets).
+ * @param out   One value per target; its size must equal gpair.Shape(1).
+ */
+void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+              linalg::VectorView<float> out) {
+  auto n_targets = out.Size();
+  CHECK_EQ(n_targets, gpair.Shape(1));
+  // One accumulator row per thread so that threads never write to the same element.
+  linalg::Tensor<GradientPairPrecise, 2> thread_sums =
+      linalg::Constant(ctx, GradientPairPrecise{}, ctx->Threads(), n_targets);
+  auto h_thread_sums = thread_sums.HostView();
+  // Rows of gpair are samples and columns are targets: parallelise over the rows and
+  // accumulate every column into the row owned by the executing thread.
+  common::ParallelFor(gpair.Shape(0), ctx->Threads(), [&](auto sample) {
+    auto tid = omp_get_thread_num();
+    for (bst_target_t target = 0; target < n_targets; ++target) {
+      h_thread_sums(tid, target) += GradientPairPrecise{gpair(sample, target)};
+    }
+  });
+  // Fold the remaining per-thread rows into row 0, which then holds the local totals.
+  auto totals = h_thread_sums.Slice(0, linalg::All());
+  for (std::int32_t tid = 1, n_threads = ctx->Threads(); tid < n_threads; ++tid) {
+    for (bst_target_t target = 0; target < n_targets; ++target) {
+      totals(target) += h_thread_sums(tid, target);
+    }
+  }
+  CHECK(totals.CContiguous());
+  // The buffer is handed to the all-reduce as raw doubles, two (grad, hess) per target.
+  collective::Allreduce<collective::Operation::kSum>(
+      reinterpret_cast<double*>(totals.Values().data()), totals.Size() * 2);
+
+  for (std::size_t target = 0, n = totals.Size(); target < n; ++target) {
+    auto const& total = totals(target);
+    out(target) = static_cast<float>(CalcUnregularizedWeight(total.GetGrad(), total.GetHess()));
+  }
+}
+}  // namespace cpu_impl
+
+namespace cuda_impl {
+// Declaration of the GPU implementation; the dispatcher below calls it when the context
+// is not a CPU context.
+void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+              linalg::VectorView<float> out);
+
+// When XGBOOST_USE_CUDA is not defined this translation unit provides the definition
+// itself: a stub that ignores its arguments and only calls common::AssertGPUSupport().
+// NOTE(review): presumably that call reports a build without GPU support -- confirm in
+// common/common.h.
+#if !defined(XGBOOST_USE_CUDA)
+inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
+                     linalg::VectorView<float>) {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace cuda_impl
+
+/**
+ * @brief Fit a stump to the gradient and write one weight per target into `out`.
+ *
+ * The flat gradient vector is viewed as a (n_samples, n_targets) matrix where
+ * n_samples = gpair.Size() / n_targets, then handed to the CPU or the CUDA
+ * implementation depending on the context.
+ *
+ * @param ctx       Selects the device (gpu_id) and the implementation (IsCPU()).
+ * @param gpair     Flat gradient pairs; moved to the device selected by ctx.
+ * @param n_targets Number of targets, must be at least 1.
+ * @param out       Reshaped to n_targets and filled with the result.
+ */
+void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
+              bst_target_t n_targets, linalg::Vector<float>* out) {
+  // n_targets is used as a divisor below; fail with a message instead of dividing by zero.
+  CHECK_GE(n_targets, 1) << "Number of targets must be positive.";
+  out->SetDevice(ctx->gpu_id);
+  out->Reshape(n_targets);
+  auto n_samples = gpair.Size() / n_targets;
+
+  gpair.SetDevice(ctx->gpu_id);
+  linalg::TensorView<GradientPair const, 2> gpair_t{
+      ctx->IsCPU() ? gpair.ConstHostSpan() : gpair.ConstDeviceSpan(),
+      {n_samples, n_targets},
+      ctx->gpu_id};
+  if (ctx->IsCPU()) {
+    cpu_impl::FitStump(ctx, gpair_t, out->HostView());
+  } else {
+    cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
+  }
+}
+}  // namespace tree
+}  // namespace xgboost
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ *
+ * \brief Utilities for estimating initial score.
+ */
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif                                            // !defined(NOMINMAX)
+#include <thrust/execution_policy.h>              // cuda::par
+#include <thrust/iterator/counting_iterator.h>    // thrust::make_counting_iterator
+
+#include <cstddef>                                // std::size_t
+
+#include "../collective/device_communicator.cuh"  // DeviceCommunicator
+#include "../common/device_helpers.cuh"           // dh::MakeTransformIterator
+#include "fit_stump.h"
+#include "xgboost/base.h"     // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
+#include "xgboost/context.h"  // Context
+#include "xgboost/linalg.h"   // TensorView, Tensor, Constant
+#include "xgboost/logging.h"  // CHECK_EQ
+#include "xgboost/span.h"     // span
+
+namespace xgboost {
+namespace tree {
+namespace cuda_impl {
+/**
+ * @brief CUDA implementation: per-target sum of gradient pairs turned into a stump weight.
+ *
+ * @param ctx   Supplies the device ordinal used for views and the device communicator.
+ * @param gpair Gradient pairs viewed as (samples, targets).
+ * @param out   One value per target; its size must equal gpair.Shape(1).
+ */
+void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+              linalg::VectorView<float> out) {
+  auto n_targets = out.Size();
+  CHECK_EQ(n_targets, gpair.Shape(1));
+  linalg::Vector<GradientPairPrecise> sum = linalg::Constant(ctx, GradientPairPrecise{}, n_targets);
+  CHECK(out.Contiguous());
+
+  // Walk the matrix target by target: flat index i maps to target i / n_samples and
+  // sample i % n_samples, so equal keys are adjacent and reduce_by_key sums each column.
+  auto n_samples = gpair.Shape(0);
+  auto counting = thrust::make_counting_iterator(0ul);
+  auto key_it = dh::MakeTransformIterator<bst_target_t>(
+      counting, [=] XGBOOST_DEVICE(std::size_t idx) -> bst_target_t { return idx / n_samples; });
+  auto grad_it = dh::MakeTransformIterator<GradientPairPrecise>(
+      counting, [=] XGBOOST_DEVICE(std::size_t idx) -> GradientPairPrecise {
+        auto target = idx / n_samples;
+        auto sample = idx % n_samples;
+        return GradientPairPrecise{gpair(sample, target)};
+      });
+  auto d_sum = sum.View(ctx->gpu_id);
+  CHECK(d_sum.CContiguous());
+
+  dh::XGBCachingDeviceAllocator<char> alloc;
+  auto policy = thrust::cuda::par(alloc);
+  // Keys are not needed afterwards, only the per-target sums are stored.
+  thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
+                        thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
+
+  // The buffer is handed to the all-reduce as raw doubles, two (grad, hess) per target.
+  collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(ctx->gpu_id);
+  communicator->AllReduceSum(reinterpret_cast<double*>(d_sum.Values().data()), d_sum.Size() * 2);
+
+  thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
+                     [=] XGBOOST_DEVICE(std::size_t target) mutable {
+                       out(target) = static_cast<float>(CalcUnregularizedWeight(
+                           d_sum(target).GetGrad(), d_sum(target).GetHess()));
+                     });
+}
+}  // namespace cuda_impl
+}  // namespace tree
+}  // namespace xgboost
--- a/src/tree/fit_stump.h
+++ b/src/tree/fit_stump.h
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ *
+ * \brief Utilities for estimating initial score.
+ */
+
+#ifndef XGBOOST_TREE_FIT_STUMP_H_
+#define XGBOOST_TREE_FIT_STUMP_H_
+
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+
+#include <algorithm>  // std::max
+
+#include "../common/common.h"            // AssertGPUSupport
+#include "xgboost/base.h"                // GradientPair
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // TensorView
+
+namespace xgboost {
+namespace tree {
+
+/**
+ * @brief Leaf weight without regularization: -sum_grad / max(sum_hess, kRtEps).
+ *
+ * @tparam T       Arithmetic type of the sums; both are promoted to double.
+ * @param sum_grad Sum of first-order gradients.
+ * @param sum_hess Sum of second-order gradients, clamped from below to kRtEps.
+ * @return The weight as a double.
+ */
+template <typename T>
+XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
+  // std::max deduces a single type from both arguments, so promote first; without this
+  // the function only compiles when T is double.
+  auto const grad = static_cast<double>(sum_grad);
+  auto const hess = static_cast<double>(sum_hess);
+  return -grad / std::max(hess, static_cast<double>(kRtEps));
+}
+
+/**
+ * @brief Fit a tree stump as an estimation of base_score.
+ *
+ * For every target the gradient pairs are summed over all samples, the sums are
+ * all-reduced across workers, and the result is -sum_grad / max(sum_hess, kRtEps).
+ *
+ * @param ctx       Context; its gpu_id selects the device and IsCPU() selects between
+ *                  the CPU and the CUDA implementation.
+ * @param gpair     Flat gradient pairs, interpreted as a (n_samples, n_targets) matrix
+ *                  with n_samples = gpair.Size() / n_targets.
+ * @param n_targets Number of targets (columns of the gradient matrix).
+ * @param out       Output vector; reshaped to n_targets and filled with one value per
+ *                  target.
+ */
+void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
+              bst_target_t n_targets, linalg::Vector<float>* out);
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_FIT_STUMP_H_