Make binary bin search reusable. (#6058)

* Move binary search row to hist util.
* Remove dead code.
This commit is contained in:
Jiaming Yuan 2020-08-26 05:05:11 +08:00 committed by GitHub
parent 9c14e430af
commit 80c8547147
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 56 additions and 102 deletions

View File

@ -84,6 +84,14 @@
#define XGBOOST_DEVICE
#endif // defined (__CUDA__) || defined(__NVCC__)
#if defined(__CUDA__) || defined(__CUDACC__)
#define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
#define XGBOOST_DEV_INLINE __device__ __forceinline__
#else
#define XGBOOST_HOST_DEV_INLINE
#define XGBOOST_DEV_INLINE
#endif // defined(__CUDA__) || defined(__CUDACC__)
// These checks are for the Makefile.
#if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
/* default logic for software pre-fetching */

View File

@ -96,9 +96,6 @@ T __device__ __forceinline__ atomicAdd(T *addr, T v) { // NOLINT
namespace dh {
#define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
#define DEV_INLINE __device__ __forceinline__
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
@ -184,9 +181,11 @@ inline void CheckComputeCapability() {
}
}
DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
XGBOOST_DEV_INLINE void AtomicOrByte(unsigned int *__restrict__ buffer,
size_t ibyte, unsigned char b) {
atomicOr(&buffer[ibyte / sizeof(unsigned int)],
static_cast<unsigned int>(b) << (ibyte % (sizeof(unsigned int)) * 8));
static_cast<unsigned int>(b)
<< (ibyte % (sizeof(unsigned int)) * 8));
}
template <typename T>
@ -994,7 +993,7 @@ class SegmentSorter {
// Atomic add function for gradients
template <typename OutputGradientT, typename InputGradientT>
DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
const InputGradientT& gpair) {
auto dst_ptr = reinterpret_cast<typename OutputGradientT::ValueT*>(dest);

View File

@ -281,6 +281,33 @@ struct GHistIndexMatrix {
bool isDense_;
};
// Binary-search one row segment [begin, end) of the gradient index for the
// bin belonging to a single feature, whose bin ids occupy the half-open
// value range [fidx_begin, fidx_end).
//
// \param begin       First position of the row segment to search.
// \param end         One past the last position of the row segment.
// \param data        Random-access gradient index; data[i] yields a bin id.
//                    (Bin ids within a row are assumed sorted — presumably
//                    guaranteed by the index layout; verify at call sites.)
// \param fidx_begin  First bin id owned by the feature (inclusive).
// \param fidx_end    One past the last bin id owned by the feature.
// \return The bin id found for the feature, or -1 when the feature has no
//         entry in this row (missing value).
template <typename GradientIndex>
int32_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(bst_uint begin, bst_uint end,
GradientIndex const &data,
uint32_t const fidx_begin,
uint32_t const fidx_end) {
// Sentinel meaning "no midpoint inspected yet".
uint32_t previous_middle = std::numeric_limits<uint32_t>::max();
while (end != begin) {
auto middle = begin + (end - begin) / 2;
// `begin` is advanced to `middle` (not middle + 1) below, so the range can
// stop shrinking; revisiting the same midpoint means the search is done.
if (middle == previous_middle) {
break;
}
previous_middle = middle;
auto gidx = data[middle];
if (gidx >= fidx_begin && gidx < fidx_end) {
// Bin id falls inside the feature's range: found.
return static_cast<int32_t>(gidx);
} else if (gidx < fidx_begin) {
begin = middle;
} else {
end = middle;
}
}
// Value is missing
return -1;
}
struct GHistIndexBlock {
const size_t* row_ptr;
const uint32_t* index;

View File

@ -13,34 +13,6 @@
#include <thrust/binary_search.h>
namespace xgboost {
// Find a gidx value for a given feature otherwise return -1 if not found
//
// Binary-searches the compressed row segment [begin, end) for a bin index
// lying in the feature's bin range [fidx_begin, fidx_end).
// NOTE(review): data[middle] comes from CompressedIterator<uint32_t> while
// fidx_begin/fidx_end are int, so the comparisons below mix signed and
// unsigned operands — presumably total bin counts stay far below INT_MAX;
// confirm against the quantile-sketch configuration.
__forceinline__ __device__ int BinarySearchRow(
bst_uint begin, bst_uint end,
common::CompressedIterator<uint32_t> data,
int const fidx_begin, int const fidx_end) {
// Sentinel meaning "no midpoint inspected yet".
bst_uint previous_middle = UINT32_MAX;
while (end != begin) {
auto middle = begin + (end - begin) / 2;
// `begin` is advanced to `middle` (not middle + 1) below, so the range can
// stop shrinking; revisiting the same midpoint means the search is done.
if (middle == previous_middle) {
break;
}
previous_middle = middle;
auto gidx = data[middle];
if (gidx >= fidx_begin && gidx < fidx_end) {
// Bin id falls inside the feature's range: found.
return gidx;
} else if (gidx < fidx_begin) {
begin = middle;
} else {
end = middle;
}
}
// Value is missing
return -1;
}
/** \brief Struct for accessing and manipulating an ellpack matrix on the
* device. Does not own underlying memory and may be trivially copied into
* kernels.*/
@ -83,7 +55,7 @@ struct EllpackDeviceAccessor {
if (is_dense) {
gidx = gidx_iter[row_begin + fidx];
} else {
gidx = BinarySearchRow(row_begin,
gidx = common::BinarySearchBin(row_begin,
row_end,
gidx_iter,
feature_segments[fidx],

View File

@ -134,7 +134,7 @@ struct DeviceAdapterLoader {
using BatchT = Batch;
DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start) :
batch{batch},
@ -158,7 +158,7 @@ struct DeviceAdapterLoader {
__syncthreads();
}
DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) {
return smem[threadIdx.x * columns + fidx];
}

View File

@ -34,7 +34,7 @@ namespace tree {
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
template <typename T>
DEV_INLINE __host__ T CreateRoundingFactor(T max_abs, int n) {
XGBOOST_DEV_INLINE __host__ T CreateRoundingFactor(T max_abs, int n) {
T delta = max_abs / (static_cast<T>(1.0) - 2 * n * std::numeric_limits<T>::epsilon());
// Calculate ceil(log_2(delta)).
@ -53,20 +53,20 @@ struct Pair {
GradientPair first;
GradientPair second;
};
DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
return {lhs.first + rhs.first, lhs.second + rhs.second};
}
} // anonymous namespace
struct Clip : public thrust::unary_function<GradientPair, Pair> {
static DEV_INLINE float Pclip(float v) {
static XGBOOST_DEV_INLINE float Pclip(float v) {
return v > 0 ? v : 0;
}
static DEV_INLINE float Nclip(float v) {
static XGBOOST_DEV_INLINE float Nclip(float v) {
return v < 0 ? abs(v) : 0;
}
DEV_INLINE Pair operator()(GradientPair x) const {
XGBOOST_DEV_INLINE Pair operator()(GradientPair x) const {
auto pg = Pclip(x.GetGrad());
auto ph = Pclip(x.GetHess());

View File

@ -16,7 +16,7 @@ template <typename GradientSumT>
GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair);
template <typename T>
DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x) {
XGBOOST_DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x) {
return (rounding_factor + static_cast<T>(x)) - rounding_factor;
}

View File

@ -114,58 +114,6 @@ struct DeviceSplitCandidateReduceOp {
}
};
// Per-node statistics kept on the device while growing a tree. `idx` doubles
// as the key for reduce/scan passes; the kUnusedNode sentinel marks both
// "node not part of the tree" (idx) and "no split recorded" (fidx).
struct DeviceNodeStats {
GradientPair sum_gradients;
// Gain computed for this node by CalcGain; -FLT_MAX until the
// parameterized constructor runs.
float root_gain {-FLT_MAX};
// Node weight computed by CalcWeight; -FLT_MAX until the
// parameterized constructor runs.
float weight {-FLT_MAX};
/** default direction for missing values */
DefaultDirection dir {kLeftDir};
/** threshold value for comparison */
float fvalue {0.0f};
// Gradient sums of the left/right children, set by SetSplit.
GradientPair left_sum;
GradientPair right_sum;
/** \brief The feature index. */
int fidx{kUnusedNode};
/** node id (used as key for reduce/scan) */
NodeIdT idx{kUnusedNode};
// Default-constructed stats represent an unused node (idx == kUnusedNode).
XGBOOST_DEVICE DeviceNodeStats() {} // NOLINT
// Initialize a node from its gradient sum; derives root_gain and weight
// from the training parameters via CalcGain / CalcWeight.
template <typename ParamT>
HOST_DEV_INLINE DeviceNodeStats(GradientPair sum_gradients, NodeIdT nidx,
const ParamT& param)
: sum_gradients(sum_gradients),
idx(nidx) {
this->root_gain =
CalcGain(param, sum_gradients.GetGrad(), sum_gradients.GetHess());
this->weight =
CalcWeight(param, sum_gradients.GetGrad(), sum_gradients.GetHess());
}
// Record the chosen split (threshold, feature, default direction, and the
// two child gradient sums) for this node.
HOST_DEV_INLINE void SetSplit(float fvalue, int fidx, DefaultDirection dir,
GradientPair left_sum, GradientPair right_sum) {
this->fvalue = fvalue;
this->fidx = fidx;
this->dir = dir;
this->left_sum = left_sum;
this->right_sum = right_sum;
}
// Convenience overload copying the corresponding fields out of a
// DeviceSplitCandidate.
HOST_DEV_INLINE void SetSplit(const DeviceSplitCandidate& split) {
this->SetSplit(split.fvalue, split.findex, split.dir, split.left_sum,
split.right_sum);
}
/** Tells whether this node is part of the decision tree */
HOST_DEV_INLINE bool IsUnused() const { return (idx == kUnusedNode); }
/** Tells whether this node is a leaf of the decision tree */
HOST_DEV_INLINE bool IsLeaf() const {
return (!IsUnused() && (fidx == kUnusedNode));
}
};
template <typename T>
struct SumCallbackOp {
// Running prefix