Support multi-target, fit intercept for hinge. (#9850)

This commit is contained in:
Jiaming Yuan
2023-12-08 05:50:41 +08:00
committed by GitHub
parent 39c637ee19
commit 42de9206fc
8 changed files with 221 additions and 155 deletions

View File

@@ -1,31 +1,48 @@
/**
 * Copyright 2021-2023, XGBoost Contributors
 */
#ifndef XGBOOST_COMMON_LINALG_OP_CUH_
#define XGBOOST_COMMON_LINALG_OP_CUH_
#include "device_helpers.cuh"
#include <cstdint> // for int32_t
#include <cstdlib> // for size_t
#include <tuple> // for apply
#include "device_helpers.cuh" // for LaunchN
#include "linalg_op.h"
#include "xgboost/context.h"
#include "xgboost/linalg.h"
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for TensorView
namespace xgboost {
namespace linalg {
namespace cuda_impl {
// Use template specialization to dispatch, Windows + CUDA 11.8 doesn't support extended
// lambda inside constexpr if.
//
// `fn` is invoked on the device with one unraveled index per dimension:
// fn(i) for vectors, fn(i, j, ...) for higher-rank tensors.
template <typename T, std::int32_t D>
struct ElementWiseImpl {
  template <typename Fn>
  void operator()(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
    // The D == 1 case is handled by the specialization below.
    static_assert(D > 1);
    dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) mutable {
      // Convert the flat element index into per-dimension indices and expand
      // the resulting tuple into fn's arguments.
      std::apply(fn, linalg::UnravelIndex(i, t.Shape()));
    });
  }
};

// Vector specialization: fn receives the flat index directly, no unraveling.
template <typename T>
struct ElementWiseImpl<T, 1> {
  template <typename Fn>
  void operator()(linalg::TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
    dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) { fn(i); });
  }
};

/**
 * @brief Launch an element-wise kernel on the tensor's device.
 *
 * @param t  Tensor view whose every element index is visited.
 * @param fn Device-callable invoked with the element's indices (see above).
 * @param s  CUDA stream used for the launch; defaults to the null stream.
 */
template <typename T, std::int32_t D, typename Fn>
void ElementWiseKernel(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
  // Make sure the launch happens on the device owning the tensor's memory.
  dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
  cuda_impl::ElementWiseImpl<T, D>{}(t, fn, s);
}
} // namespace cuda_impl
template <typename T, int32_t D, typename Fn>
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
@@ -42,7 +59,8 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_
/**
 * @brief Visit every element index of `t`, dispatching to the CUDA or the CPU
 *        implementation based on the context's device.
 *
 * @param ctx Runtime context; its device selects the implementation and, on
 *            CPU, supplies the thread count.
 * @param t   Tensor view to iterate over.
 * @param fn  Callable invoked with one index per dimension (fn(i) for vectors).
 */
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
  ctx->IsCUDA() ? cuda_impl::ElementWiseKernel(t, fn)
                : ElementWiseKernelHost(t, ctx->Threads(), fn);
}
} // namespace linalg
} // namespace xgboost

View File

@@ -1,5 +1,5 @@
/**
 * Copyright 2021-2023, XGBoost Contributors
 */
#ifndef XGBOOST_COMMON_LINALG_OP_H_
#define XGBOOST_COMMON_LINALG_OP_H_
@@ -27,17 +27,23 @@ void ElementWiseTransformHost(linalg::TensorView<T, D> t, int32_t n_threads, Fn&
}
}
/**
 * @brief Visit every element index of `t` in parallel on the CPU.
 *
 * @param t         Tensor view to iterate over.
 * @param n_threads Number of worker threads for ParallelFor.
 * @param fn        Callable invoked with one index per dimension: fn(i) for
 *                  vectors, fn(i, j) for matrices, etc.
 */
template <typename T, std::int32_t D, typename Fn>
void ElementWiseKernelHost(linalg::TensorView<T, D> t, std::int32_t n_threads, Fn &&fn) {
  if constexpr (D == 1) {
    // Vector: pass the flat index straight through, no unraveling needed.
    common::ParallelFor(t.Size(), n_threads, [&](std::size_t i) { fn(i); });
  } else if (D == 2 && t.CContiguous() && t.Shape(0) > t.Shape(1) * 64) {
    // Heuristic. Tall, c-contiguous matrix: parallelize over rows and walk the
    // columns sequentially so each thread touches a contiguous range.
    auto n_rows = t.Shape(0);
    auto n_columns = t.Shape(1);
    common::ParallelFor(n_rows, n_threads, [&](std::size_t i) {
      for (std::size_t j = 0; j < n_columns; ++j) {
        fn(i, j);
      }
    });
  } else {
    // General case: one task per element; unravel the flat index and expand
    // the tuple into fn's per-dimension arguments.
    common::ParallelFor(t.Size(), n_threads, [&](std::size_t i) {
      auto idx = linalg::UnravelIndex(i, t.Shape());
      std::apply(fn, idx);
    });
  }
}