Pairwise ranking objective implementation on GPU (#4873)
* - pairwise ranking objective implementation on GPU
- there are a couple more algorithms (NDCG and MAP) for which support will be
added in follow-up PRs
- with no label groups defined, the gradient computation is 90x faster on GPU
(120M-instance mortgage dataset)
- it can perform an order of magnitude faster with ~10 groups (given adequate
cores for the CPU implementation)
* Add JSON config to rank obj.
This commit is contained in:
@@ -12,42 +12,9 @@
|
||||
#include "../common/random.h"
|
||||
#include "param.h"
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600

#else  // In device code and CUDA < 600
// Software fallback for double-precision atomicAdd: native hardware support
// requires compute capability >= 6.0, so older architectures emulate it with
// a 64-bit compare-and-swap retry loop.
XGBOOST_DEVICE __forceinline__ double atomicAdd(double* address, double val) {
  auto* as_ull = reinterpret_cast<unsigned long long int*>(address);  // NOLINT
  unsigned long long int observed = *as_ull;  // NOLINT
  unsigned long long int expected;            // NOLINT
  do {
    expected = observed;
    // Attempt to swap in (current value + val); on failure, retry with the
    // freshly observed value returned by atomicCAS.
    observed = atomicCAS(
        as_ull, expected,
        __double_as_longlong(val + __longlong_as_double(expected)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
    // NaN)
  } while (expected != observed);
  return __longlong_as_double(observed);
}
#endif
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
// Atomic add function for gradients
|
||||
template <typename OutputGradientT, typename InputGradientT>
|
||||
DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
|
||||
const InputGradientT& gpair) {
|
||||
auto dst_ptr = reinterpret_cast<typename OutputGradientT::ValueT*>(dest);
|
||||
|
||||
atomicAdd(dst_ptr,
|
||||
static_cast<typename OutputGradientT::ValueT>(gpair.GetGrad()));
|
||||
atomicAdd(dst_ptr + 1,
|
||||
static_cast<typename OutputGradientT::ValueT>(gpair.GetHess()));
|
||||
}
|
||||
|
||||
struct GPUTrainingParam {
|
||||
// minimum amount of hessian(weight) allowed in a child
|
||||
float min_child_weight;
|
||||
|
||||
@@ -421,7 +421,7 @@ __global__ void SharedMemHistKernel(xgboost::ELLPackMatrix matrix,
|
||||
// global memory
|
||||
GradientSumT* atomic_add_ptr =
|
||||
use_shared_memory_histograms ? smem_arr : d_node_hist;
|
||||
AtomicAddGpair(atomic_add_ptr + gidx, d_gpair[ridx]);
|
||||
dh::AtomicAddGpair(atomic_add_ptr + gidx, d_gpair[ridx]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -430,7 +430,7 @@ __global__ void SharedMemHistKernel(xgboost::ELLPackMatrix matrix,
|
||||
__syncthreads();
|
||||
for (auto i :
|
||||
dh::BlockStrideRange(static_cast<size_t>(0), matrix.BinCount())) {
|
||||
AtomicAddGpair(d_node_hist + i, smem_arr[i]);
|
||||
dh::AtomicAddGpair(d_node_hist + i, smem_arr[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user