diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index cf30b969c..814e26982 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -84,6 +84,14 @@
 #define XGBOOST_DEVICE
 #endif  // defined (__CUDA__) || defined(__NVCC__)
 
+#if defined(__CUDA__) || defined(__CUDACC__)
+#define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
+#define XGBOOST_DEV_INLINE __device__ __forceinline__
+#else
+#define XGBOOST_HOST_DEV_INLINE
+#define XGBOOST_DEV_INLINE
+#endif  // defined(__CUDA__) || defined(__CUDACC__)
+
 // These check are for Makefile.
 #if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
 /* default logic for software pre-fetching */
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 5e4f1eae0..b3c05ce23 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -96,9 +96,6 @@ T __device__ __forceinline__ atomicAdd(T *addr, T v) {  // NOLINT
 
 namespace dh {
 
-#define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
-#define DEV_INLINE __device__ __forceinline__
-
 #ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
 
@@ -184,9 +181,11 @@ inline void CheckComputeCapability() {
   }
 }
 
-DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
+XGBOOST_DEV_INLINE void AtomicOrByte(unsigned int *__restrict__ buffer,
+                                     size_t ibyte, unsigned char b) {
   atomicOr(&buffer[ibyte / sizeof(unsigned int)],
-           static_cast<unsigned int>(b) << (ibyte % (sizeof(unsigned int)) * 8));
+           static_cast<unsigned int>(b)
+               << (ibyte % (sizeof(unsigned int)) * 8));
 }
 
 template <typename T>
@@ -994,8 +993,8 @@ class SegmentSorter {
 
 // Atomic add function for gradients
 template <typename OutputGradientT, typename InputGradientT>
-DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
-                               const InputGradientT& gpair) {
+XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
+                                       const InputGradientT& gpair) {
   auto dst_ptr = reinterpret_cast<typename OutputGradientT::ValueT*>(dest);
 
   atomicAdd(dst_ptr,
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 0334b9012..ab1defa0e 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -281,6 +281,33 @@ struct GHistIndexMatrix {
   bool isDense_;
 };
 
+template <typename GradientIndex>
+int32_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(bst_uint begin, bst_uint end,
+                                                GradientIndex const &data,
+                                                uint32_t const fidx_begin,
+                                                uint32_t const fidx_end) {
+  uint32_t previous_middle = std::numeric_limits<uint32_t>::max();
+  while (end != begin) {
+    auto middle = begin + (end - begin) / 2;
+    if (middle == previous_middle) {
+      break;
+    }
+    previous_middle = middle;
+
+    auto gidx = data[middle];
+
+    if (gidx >= fidx_begin && gidx < fidx_end) {
+      return static_cast<int32_t>(gidx);
+    } else if (gidx < fidx_begin) {
+      begin = middle;
+    } else {
+      end = middle;
+    }
+  }
+  // Value is missing
+  return -1;
+}
+
 struct GHistIndexBlock {
   const size_t* row_ptr;
   const uint32_t* index;
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index 8cb0162fb..ce2648f2a 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -13,34 +13,6 @@
 #include <xgboost/data.h>
 
 namespace xgboost {
-
-// Find a gidx value for a given feature otherwise return -1 if not found
-__forceinline__ __device__ int BinarySearchRow(
-    bst_uint begin, bst_uint end,
-    common::CompressedIterator<uint32_t> data,
-    int const fidx_begin, int const fidx_end) {
-  bst_uint previous_middle = UINT32_MAX;
-  while (end != begin) {
-    auto middle = begin + (end - begin) / 2;
-    if (middle == previous_middle) {
-      break;
-    }
-    previous_middle = middle;
-
-    auto gidx = data[middle];
-
-    if (gidx >= fidx_begin && gidx < fidx_end) {
-      return gidx;
-    } else if (gidx < fidx_begin) {
-      begin = middle;
-    } else {
-      end = middle;
-    }
-  }
-  // Value is missing
-  return -1;
-}
-
 /** \brief Struct for accessing and manipulating an ellpack matrix on the
  * device. Does not own underlying memory and may be trivially copied into
  * kernels.*/
@@ -83,11 +55,11 @@
     if (is_dense) {
       gidx = gidx_iter[row_begin + fidx];
     } else {
-      gidx = BinarySearchRow(row_begin,
-                             row_end,
-                             gidx_iter,
-                             feature_segments[fidx],
-                             feature_segments[fidx + 1]);
+      gidx = common::BinarySearchBin(row_begin,
+                                     row_end,
+                                     gidx_iter,
+                                     feature_segments[fidx],
+                                     feature_segments[fidx + 1]);
     }
     return gidx;
   }
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index a36a131fa..9380df399 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -134,9 +134,9 @@ struct DeviceAdapterLoader {
 
   using BatchT = Batch;
 
-  DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
-                                 bst_feature_t num_features, bst_row_t num_rows,
-                                 size_t entry_start) :
+  XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
+                                         bst_feature_t num_features, bst_row_t num_rows,
+                                         size_t entry_start) :
     batch{batch},
     columns{num_features},
     use_shared{use_shared} {
@@ -158,7 +158,7 @@
     __syncthreads();
   }
 
-  DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
+  XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
     if (use_shared) {
      return smem[threadIdx.x * columns + fidx];
    }
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index aa4fe9958..adf973817 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -34,7 +34,7 @@ namespace tree {
  * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
 */
 template <typename T>
-DEV_INLINE __host__ T CreateRoundingFactor(T max_abs, int n) {
+XGBOOST_DEV_INLINE __host__ T CreateRoundingFactor(T max_abs, int n) {
   T delta = max_abs / (static_cast<T>(1.0) - 2 * n * std::numeric_limits<T>::epsilon());
 
   // Calculate ceil(log_2(delta)).
@@ -53,20 +53,20 @@ struct Pair {
   GradientPair first;
   GradientPair second;
 };
-DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
+XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
   return {lhs.first + rhs.first, lhs.second + rhs.second};
 }
 }  // anonymous namespace
 
 struct Clip : public thrust::unary_function<GradientPair, Pair> {
-  static DEV_INLINE float Pclip(float v) {
+  static XGBOOST_DEV_INLINE float Pclip(float v) {
     return v > 0 ? v : 0;
   }
-  static DEV_INLINE float Nclip(float v) {
+  static XGBOOST_DEV_INLINE float Nclip(float v) {
     return v < 0 ? abs(v) : 0;
   }
 
-  DEV_INLINE Pair operator()(GradientPair x) const {
+  XGBOOST_DEV_INLINE Pair operator()(GradientPair x) const {
     auto pg = Pclip(x.GetGrad());
     auto ph = Pclip(x.GetHess());
 
diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 6b2675ebc..84c79568f 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -16,7 +16,7 @@ template <typename GradientSumT>
 GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair);
 
 template <typename T>
-DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x) {
+XGBOOST_DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x) {
   return (rounding_factor + static_cast<T>(x)) - rounding_factor;
 }
 
diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh
index 63da94ada..6599ee2e0 100644
--- a/src/tree/updater_gpu_common.cuh
+++ b/src/tree/updater_gpu_common.cuh
@@ -114,58 +114,6 @@ struct DeviceSplitCandidateReduceOp {
   }
 };
 
-struct DeviceNodeStats {
-  GradientPair sum_gradients;
-  float root_gain {-FLT_MAX};
-  float weight {-FLT_MAX};
-
-  /** default direction for missing values */
-  DefaultDirection dir {kLeftDir};
-  /** threshold value for comparison */
-  float fvalue {0.0f};
-  GradientPair left_sum;
-  GradientPair right_sum;
-  /** \brief The feature index. */
-  int fidx{kUnusedNode};
-  /** node id (used as key for reduce/scan) */
-  NodeIdT idx{kUnusedNode};
-
-  XGBOOST_DEVICE DeviceNodeStats() {}  // NOLINT
-
-  template <typename ParamT>
-  HOST_DEV_INLINE DeviceNodeStats(GradientPair sum_gradients, NodeIdT nidx,
-                                  const ParamT& param)
-      : sum_gradients(sum_gradients),
-        idx(nidx) {
-    this->root_gain =
-        CalcGain(param, sum_gradients.GetGrad(), sum_gradients.GetHess());
-    this->weight =
-        CalcWeight(param, sum_gradients.GetGrad(), sum_gradients.GetHess());
-  }
-
-  HOST_DEV_INLINE void SetSplit(float fvalue, int fidx, DefaultDirection dir,
-                                GradientPair left_sum, GradientPair right_sum) {
-    this->fvalue = fvalue;
-    this->fidx = fidx;
-    this->dir = dir;
-    this->left_sum = left_sum;
-    this->right_sum = right_sum;
-  }
-
-  HOST_DEV_INLINE void SetSplit(const DeviceSplitCandidate& split) {
-    this->SetSplit(split.fvalue, split.findex, split.dir, split.left_sum,
-                   split.right_sum);
-  }
-
-  /** Tells whether this node is part of the decision tree */
-  HOST_DEV_INLINE bool IsUnused() const { return (idx == kUnusedNode); }
-
-  /** Tells whether this node is a leaf of the decision tree */
-  HOST_DEV_INLINE bool IsLeaf() const {
-    return (!IsUnused() && (fidx == kUnusedNode));
-  }
-};
-
 template <typename T>
 struct SumCallbackOp {
   // Running prefix
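
Note on the CreateRoundingFactor / TruncateWithRoundingFactor helpers whose qualifiers change above: together they implement pre-rounded summation, which is why the comment in histogram.cu can promise that the full gradient reduction stays reproducible on GPU. The host-only C++ sketch below illustrates that technique; the frexp/ldexp body of CreateRoundingFactor and the positive/negative-sum bound are assumptions filled in for code the diff does not show (the hunk is truncated), and the variable names are illustrative rather than taken from the library.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

template <typename T>
T CreateRoundingFactor(T max_abs, int n) {
  T delta = max_abs / (static_cast<T>(1.0) - 2 * n * std::numeric_limits<T>::epsilon());
  // Round delta up to a power of two: every truncated addend then sits on one
  // fixed grid, so each partial sum is exactly representable.  (frexp/ldexp
  // body assumed here; the diff cuts the original hunk off after the comment.)
  int exponent = 0;
  std::frexp(delta, &exponent);
  return std::ldexp(static_cast<T>(1.0), exponent);
}

template <typename T>
T TruncateWithRoundingFactor(T const rounding_factor, float const x) {
  return (rounding_factor + static_cast<T>(x)) - rounding_factor;
}

int main() {
  std::vector<float> grad{0.125f, -1.7f, 3.14159f, 0.25f, -3.1e-5f, 2.5f};

  // Mirror the Clip/Pair reduction above: bound every possible partial sum by
  // the larger of the summed positive parts and the summed negative parts.
  float positive_sum = 0.0f, negative_sum = 0.0f;
  for (float g : grad) { (g > 0 ? positive_sum : negative_sum) += std::abs(g); }
  float rounding = CreateRoundingFactor<float>(std::max(positive_sum, negative_sum),
                                               static_cast<int>(grad.size()));

  // Summing the pre-rounded values forwards and backwards yields the same bits.
  float forward = 0.0f, backward = 0.0f;
  for (std::size_t i = 0; i < grad.size(); ++i) {
    forward += TruncateWithRoundingFactor<float>(rounding, grad[i]);
  }
  for (std::size_t i = grad.size(); i-- > 0;) {
    backward += TruncateWithRoundingFactor<float>(rounding, grad[i]);
  }
  std::printf("rounding=%g forward=%.9g backward=%.9g identical=%d\n", rounding,
              forward, backward, static_cast<int>(forward == backward));
  return 0;
}

Because each addend is first snapped onto the grid implied by the rounding factor, every partial sum is exact, so the two loops agree bit for bit regardless of summation order; the kernels touched above rely on the same property when atomics and reduction trees reorder additions.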