Calculate base_score based on input labels for mae. (#8107)

Fit an intercept as base score for abs loss.
Jiaming Yuan
2022-09-20 20:53:54 +08:00
committed by GitHub
parent 4f42aa5f12
commit fffb1fca52
42 changed files with 999 additions and 343 deletions

src/common/algorithm.h

@@ -1,7 +1,8 @@
 /*!
  * Copyright 2022 by XGBoost Contributors
  */
-#pragma once
+#ifndef XGBOOST_COMMON_ALGORITHM_H_
+#define XGBOOST_COMMON_ALGORITHM_H_

 #include <algorithm>  // std::upper_bound
 #include <cinttypes>  // std::size_t
@@ -14,3 +15,4 @@ auto SegmentId(It first, It last, Idx idx) {
 }
 }  // namespace common
 }  // namespace xgboost
+#endif  // XGBOOST_COMMON_ALGORITHM_H_

src/common/common.h

@@ -265,6 +265,7 @@ struct OptionalWeights {
   explicit OptionalWeights(float w) : dft{w} {}

   XGBOOST_DEVICE float operator[](size_t i) const { return weights.empty() ? dft : weights[i]; }
+  auto Empty() const { return weights.empty(); }
 };

 /**
@@ -276,7 +277,7 @@ XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
 }

 /**
- * @brief A CRTP (curiously recurring template pattern) helper function.
+ * \brief A CRTP (curiously recurring template pattern) helper function.
  *
  * https://www.fluentcpp.com/2017/05/19/crtp-helper/
  *
@@ -284,7 +285,7 @@ XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
  * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
  * 2. Avoids having to `static_cast` in a lot of places.
  *
- * @tparam T The derived class in a CRTP hierarchy.
+ * \tparam T The derived class in a CRTP hierarchy.
  */
 template <typename T>
 struct Crtp {
@@ -292,6 +293,13 @@ struct Crtp {
   T &Underlying() { return static_cast<T &>(*this); }
   T const &Underlying() const { return static_cast<T const &>(*this); }
 };
+
+/**
+ * \brief C++17 std::as_const
+ */
+template <typename T>
+typename std::add_const<T>::type &AsConst(T &v) noexcept {  // NOLINT(runtime/references)
+  return v;
+}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
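The new AsConst backports C++17's std::as_const to this C++14-bound codebase: it lets a caller select a const overload without a verbose static_cast. A minimal usage sketch — the Buffer type here is hypothetical; only AsConst comes from the header above:

struct Buffer {
  float* Ptr() { /* non-const overload: may allocate lazily */ return data_; }
  float const* Ptr() const { return data_; }  // read-only, never allocates
  float* data_{nullptr};
};

void ReadOnlyAccess(Buffer& buf) {
  // Without AsConst, overload resolution on a non-const object picks the mutating Ptr().
  float const* p = xgboost::common::AsConst(buf).Ptr();
  (void)p;
}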

src/common/linalg_op.h

@@ -4,6 +4,7 @@
 #ifndef XGBOOST_COMMON_LINALG_OP_H_
 #define XGBOOST_COMMON_LINALG_OP_H_
 #include <type_traits>
+#include <cstdint>  // std::int32_t

 #include "common.h"
 #include "threading_utils.h"
@@ -59,6 +60,31 @@ void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView<T, D> t,
   ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
+
+template <typename T, std::int32_t kDim>
+auto cbegin(TensorView<T, kDim> v) {  // NOLINT
+  auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& {
+    return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape()));
+  });
+  return it;
+}
+
+template <typename T, std::int32_t kDim>
+auto cend(TensorView<T, kDim> v) {  // NOLINT
+  return cbegin(v) + v.Size();
+}
+
+template <typename T, std::int32_t kDim>
+auto begin(TensorView<T, kDim> v) {  // NOLINT
+  auto it = common::MakeIndexTransformIter(
+      [&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); });
+  return it;
+}
+
+template <typename T, std::int32_t kDim>
+auto end(TensorView<T, kDim> v) {  // NOLINT
+  return begin(v) + v.Size();
+}
 }  // namespace linalg
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_LINALG_OP_H_
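These begin/end/cbegin/cend overloads unravel a flat index into the view's shape, so a TensorView works with standard algorithms regardless of how it is strided. A hedged sketch, assuming the headers above (SumView is illustrative):

#include <numeric>  // std::accumulate

// Sums every element of a 2-d view in logical (row-major) order.
double SumView(xgboost::linalg::TensorView<float const, 2> v) {
  return std::accumulate(xgboost::linalg::cbegin(v), xgboost::linalg::cend(v), 0.0);
}

Because the iterators go through UnravelIndex, this also works on non-contiguous views such as slices, at the cost of an index computation per element.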

src/common/numeric.cc (new file)

@@ -0,0 +1,28 @@
/*!
 * Copyright 2022 by XGBoost Contributors
 */
#include "numeric.h"

#include <numeric>      // std::accumulate
#include <type_traits>  // std::is_same

#include "threading_utils.h"             // MemStackAllocator, ParallelFor, DefaultMaxThreads
#include "xgboost/generic_parameters.h"  // Context
#include "xgboost/host_device_vector.h"  // HostDeviceVector

namespace xgboost {
namespace common {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
  if (ctx->IsCPU()) {
    auto const& h_values = values.ConstHostVector();
    MemStackAllocator<double, DefaultMaxThreads()> result_tloc(ctx->Threads(), 0);
    ParallelFor(h_values.size(), ctx->Threads(),
                [&](auto i) { result_tloc[omp_get_thread_num()] += h_values[i]; });
    auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cend(), 0.0);
    static_assert(std::is_same<decltype(result), double>::value, "");
    return result;
  }
  return cuda::Reduce(ctx, values);
}
}  // namespace common
}  // namespace xgboost
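On the CPU path, each OpenMP thread accumulates into its own slot of the MemStackAllocator and the per-thread partials are folded serially, which avoids atomics and keeps the running sum in double even though the inputs are float; the static_assert pins that down. A hedged usage sketch (assumes an initialized Context named ctx):

xgboost::HostDeviceVector<float> values{1.0f, 2.0f, 3.0f};
double total = xgboost::common::Reduce(&ctx, values);  // expected: 6.0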

src/common/numeric.cu (new file)

@@ -0,0 +1,25 @@
/*!
 * Copyright 2022 by XGBoost Contributors
 */
#include <thrust/execution_policy.h>
#include <thrust/functional.h>  // thrust::plus

#include "device_helpers.cuh"            // dh::Reduce, safe_cuda, dh::XGBCachingDeviceAllocator
#include "numeric.h"
#include "xgboost/generic_parameters.h"  // Context
#include "xgboost/host_device_vector.h"  // HostDeviceVector

namespace xgboost {
namespace common {
namespace cuda {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
  values.SetDevice(ctx->gpu_id);
  auto const d_values = values.ConstDeviceSpan();
  dh::XGBCachingDeviceAllocator<char> alloc;
  auto res = dh::Reduce(thrust::cuda::par(alloc), d_values.data(),
                        d_values.data() + d_values.size(), 0.0, thrust::plus<double>{});
  return res;
}
}  // namespace cuda
}  // namespace common
}  // namespace xgboost
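Note that the initial value passed to dh::Reduce is the double literal 0.0, so thrust::plus<double> accumulates in double precision even though the input span holds float, matching the CPU path above.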

src/common/numeric.h

@@ -8,8 +8,10 @@
 #include <iterator>  // std::iterator_traits
 #include <vector>

-#include "threading_utils.h"
-#include "xgboost/generic_parameters.h"
+#include "common.h"                      // AssertGPUSupport
+#include "threading_utils.h"             // MemStackAllocator, DefaultMaxThreads
+#include "xgboost/generic_parameters.h"  // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector

 namespace xgboost {
 namespace common {
@@ -18,8 +20,8 @@ namespace common {
  * \brief Run length encode on CPU, input must be sorted.
  */
 template <typename Iter, typename Idx>
-void RunLengthEncode(Iter begin, Iter end, std::vector<Idx> *p_out) {
-  auto &out = *p_out;
+void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
+  auto& out = *p_out;
   out = std::vector<Idx>{0};
   size_t n = std::distance(begin, end);
   for (size_t i = 1; i < n; ++i) {
@@ -45,7 +47,7 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
   auto n = static_cast<size_t>(std::distance(begin, end));
   const size_t batch_threads =
       std::max(static_cast<size_t>(1), std::min(n, static_cast<size_t>(n_threads)));
-  common::MemStackAllocator<T, 128> partial_sums(batch_threads);
+  MemStackAllocator<T, DefaultMaxThreads()> partial_sums(batch_threads);

   size_t block_size = n / batch_threads;
@@ -90,6 +92,20 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
   }
   exc.Rethrow();
 }
+
+namespace cuda {
+double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
+#if !defined(XGBOOST_USE_CUDA)
+inline double Reduce(Context const*, HostDeviceVector<float> const&) {
+  AssertGPUSupport();
+  return 0;
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace cuda
+/**
+ * \brief Reduction with summation.
+ */
+double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
 }  // namespace common
 }  // namespace xgboost
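Each GPU entry point declared in the cuda namespace is paired with an inline stub guarded by !defined(XGBOOST_USE_CUDA), so the dispatching Reduce compiles in CPU-only builds and fails loudly at runtime if the device path is ever reached. The same guard pattern extends to any new device helper; a minimal sketch with a hypothetical Max reduction (only AssertGPUSupport and the types come from the headers above):

namespace cuda {
double Max(Context const* ctx, HostDeviceVector<float> const& values);
#if !defined(XGBOOST_USE_CUDA)
inline double Max(Context const*, HostDeviceVector<float> const&) {
  AssertGPUSupport();  // aborts with a clear error in CUDA-free builds
  return 0;
}
#endif  // !defined(XGBOOST_USE_CUDA)
}  // namespace cuda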

src/common/stats.cu (new file)

@@ -0,0 +1,47 @@
/*!
 * Copyright 2022 by XGBoost Contributors
 */
#include <thrust/iterator/counting_iterator.h>  // thrust::make_counting_iterator

#include "common.h"                      // common::OptionalWeights
#include "device_helpers.cuh"            // dh::MakeTransformIterator, tcbegin, tcend
#include "stats.cuh"                     // common::SegmentedQuantile, common::SegmentedWeightedQuantile
#include "xgboost/generic_parameters.h"  // Context
#include "xgboost/host_device_vector.h"  // HostDeviceVector
#include "xgboost/linalg.h"              // linalg::TensorView, UnravelIndex, Apply

namespace xgboost {
namespace common {
namespace cuda {
float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
             common::OptionalWeights weights) {
  HostDeviceVector<size_t> segments{0, t.Size()};
  segments.SetDevice(ctx->gpu_id);
  auto d_segments = segments.ConstDeviceSpan();
  auto val_it = dh::MakeTransformIterator<float>(
      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) {
        return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
      });

  HostDeviceVector<float> quantile{0};
  quantile.SetDevice(ctx->gpu_id);
  if (weights.Empty()) {
    common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
                              val_it + t.Size(), &quantile);
  } else {
    CHECK_NE(t.Shape(1), 0);
    auto w_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                 [=] XGBOOST_DEVICE(size_t i) {
                                                   auto sample_idx = i / t.Shape(1);
                                                   return weights[sample_idx];
                                                 });
    common::SegmentedWeightedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments),
                                      val_it, val_it + t.Size(), w_it, w_it + t.Size(), &quantile);
  }

  CHECK_EQ(quantile.Size(), 1);
  return quantile.HostVector().front();
}
}  // namespace cuda
}  // namespace common
}  // namespace xgboost
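A single segment spanning [0, t.Size()) turns the segmented quantile into a global median over the whole tensor, and the weight iterator maps element i back to its row via i / t.Shape(1), so a per-sample weight applies uniformly to every column of that sample.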

src/common/stats.h

@@ -8,7 +8,8 @@
 #include <limits>
 #include <vector>

-#include "common.h"
+#include "common.h"  // AssertGPUSupport
 #include "xgboost/generic_parameters.h"
 #include "xgboost/linalg.h"

 namespace xgboost {
@@ -90,6 +91,44 @@ float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
   idx = std::min(idx, static_cast<size_t>(n - 1));
   return val(idx);
 }
+
+namespace cuda {
+float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
+             common::OptionalWeights weights);
+#if !defined(XGBOOST_USE_CUDA)
+inline float Median(Context const*, linalg::TensorView<float const, 2>, common::OptionalWeights) {
+  AssertGPUSupport();
+  return 0;
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace cuda
+
+inline float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
+                    HostDeviceVector<float> const& weights) {
+  if (!ctx->IsCPU()) {
+    weights.SetDevice(ctx->gpu_id);
+    auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
+    auto t_v = t.View(ctx->gpu_id);
+    return cuda::Median(ctx, t_v, opt_weights);
+  }
+
+  auto opt_weights = OptionalWeights(weights.ConstHostSpan());
+  auto t_v = t.HostView();
+  auto iter = common::MakeIndexTransformIter(
+      [&](size_t i) { return linalg::detail::Apply(t_v, linalg::UnravelIndex(i, t_v.Shape())); });
+  float q{0};
+  if (opt_weights.Empty()) {
+    q = common::Quantile(0.5, iter, iter + t_v.Size());
+  } else {
+    CHECK_NE(t_v.Shape(1), 0);
+    auto w_it = common::MakeIndexTransformIter([&](size_t i) {
+      auto sample_idx = i / t_v.Shape(1);
+      return opt_weights[sample_idx];
+    });
+    q = common::WeightedQuantile(0.5, iter, iter + t_v.Size(), w_it);
+  }
+  return q;
+}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_STATS_H_
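This Median overload is the piece that lets the booster fit an intercept for the absolute loss: the (weighted) median minimizes the sum of absolute deviations, just as the mean does for squared error. A minimal host-side sketch (assumes an initialized CPU Context named ctx and the Tensor initializer-list constructor from xgboost/linalg.h; the label values are illustrative):

// A 3x1 column of labels; the outlier barely moves the median.
xgboost::linalg::Tensor<float, 2> labels{{1.f, 2.f, 100.f}, {3, 1}, xgboost::Context::kCpuId};
xgboost::HostDeviceVector<float> weights;  // empty => unweighted
float base_score = xgboost::common::Median(&ctx, labels, weights);  // expected: 2.0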

src/common/threading_utils.h

@@ -8,6 +8,7 @@
 #include <dmlc/omp.h>

 #include <algorithm>
+#include <cstdint>  // std::int32_t
 #include <limits>
 #include <type_traits>  // std::is_signed
 #include <vector>
@@ -253,7 +254,7 @@ inline int32_t OmpGetNumThreads(int32_t n_threads) {
  * MaxStackSize, it will be allocated inside the stack. Otherwise, it will be
  * heap-allocated.
  */
-template <typename T, size_t MaxStackSize>
+template <typename T, std::size_t MaxStackSize>
 class MemStackAllocator {
  public:
   explicit MemStackAllocator(size_t required_size) : required_size_(required_size) {
@@ -278,11 +279,23 @@ class MemStackAllocator {
   T& operator[](size_t i) { return ptr_[i]; }
   T const& operator[](size_t i) const { return ptr_[i]; }

+  auto data() const { return ptr_; }                    // NOLINT
+  auto data() { return ptr_; }                          // NOLINT
+  std::size_t size() const { return required_size_; }   // NOLINT
+  auto cbegin() const { return data(); }                // NOLINT
+  auto cend() const { return data() + size(); }         // NOLINT
+
  private:
   T* ptr_ = nullptr;
   size_t required_size_;
   T stack_mem_[MaxStackSize];
 };
+
+/**
+ * \brief Constant that can be used for initializing static thread local memory.
+ */
+std::int32_t constexpr DefaultMaxThreads() { return 128; }
 }  // namespace common
 }  // namespace xgboost
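DefaultMaxThreads() gives call sites a shared compile-time bound for per-thread scratch buffers: requests of up to 128 slots stay in the object's stack storage, anything larger falls back to the heap. A minimal sketch of the per-thread accumulation pattern it enables, mirroring the use in numeric.cc (the Accumulate function itself is illustrative):

#include <omp.h>

#include <cstdint>  // std::int32_t
#include <numeric>  // std::accumulate
#include <vector>

double Accumulate(std::vector<float> const& values, std::int32_t n_threads) {
  // One double per thread, zero-initialized; stack storage suffices for <= 128 threads.
  xgboost::common::MemStackAllocator<double, xgboost::common::DefaultMaxThreads()> tloc(
      n_threads, 0.0);
#pragma omp parallel for num_threads(n_threads)
  for (std::int64_t i = 0; i < static_cast<std::int64_t>(values.size()); ++i) {
    tloc[omp_get_thread_num()] += values[i];
  }
  // Fold the per-thread partials serially via the new cbegin()/cend() accessors.
  return std::accumulate(tloc.cbegin(), tloc.cend(), 0.0);
}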