Pairwise ranking objective implementation on gpu (#4873)
* - pairwise ranking objective implementation on gpu
- there are couple of more algorithms (ndcg and map) for which support will be added
as follow-up pr's
- with no label groups defined, get gradient is 90x faster on gpu (120m instance
mortgage dataset)
- it can perform by an order of magnitude faster with ~ 10 groups (and adequate cores
for the cpu implementation)
* Add JSON config to rank obj.
This commit is contained in:
@@ -32,6 +32,27 @@
|
||||
#include "../common/io.h"
|
||||
#endif
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
||||
|
||||
#else // In device code and CUDA < 600
|
||||
XGBOOST_DEVICE __forceinline__ double atomicAdd(double* address, double val) {
|
||||
unsigned long long int* address_as_ull =
|
||||
(unsigned long long int*)address; // NOLINT
|
||||
unsigned long long int old = *address_as_ull, assumed; // NOLINT
|
||||
|
||||
do {
|
||||
assumed = old;
|
||||
old = atomicCAS(address_as_ull, assumed,
|
||||
__double_as_longlong(val + __longlong_as_double(assumed)));
|
||||
|
||||
// Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
|
||||
// NaN)
|
||||
} while (assumed != old);
|
||||
|
||||
return __longlong_as_double(old);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace dh {
|
||||
|
||||
#define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
|
||||
@@ -129,7 +150,8 @@ DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, un
|
||||
* \return the smallest index i such that v < cuts[i], or n if v is greater or equal
|
||||
* than all elements of the array
|
||||
*/
|
||||
DEV_INLINE int UpperBound(const float* __restrict__ cuts, int n, float v) {
|
||||
template <typename T>
|
||||
DEV_INLINE int UpperBound(const T* __restrict__ cuts, int n, T v) {
|
||||
if (n == 0) { return 0; }
|
||||
if (cuts[n - 1] <= v) { return n; }
|
||||
if (cuts[0] > v) { return 0; }
|
||||
@@ -235,7 +257,6 @@ class MemoryLogger {
|
||||
}
|
||||
num_deallocations++;
|
||||
CHECK_LE(num_deallocations, num_allocations);
|
||||
CHECK_EQ(itr->second, n);
|
||||
currently_allocated_bytes -= itr->second;
|
||||
device_allocations.erase(itr);
|
||||
}
|
||||
@@ -269,7 +290,7 @@ public:
|
||||
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< stats_.peak_allocated_bytes / 1000000 << "mb";
|
||||
<< stats_.peak_allocated_bytes / 1048576 << "MiB";
|
||||
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
|
||||
}
|
||||
};
|
||||
@@ -317,8 +338,8 @@ struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
|
||||
};
|
||||
cub::CachingDeviceAllocator& GetGlobalCachingAllocator ()
|
||||
{
|
||||
// Configure allocator with maximum cached bin size of ~1GB and no limit on
|
||||
// maximum cached bytes
|
||||
// Configure allocator with maximum cached bin size of ~1GB and no limit on
|
||||
// maximum cached bytes
|
||||
static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
|
||||
return *allocator;
|
||||
}
|
||||
@@ -977,7 +998,7 @@ class AllReducer {
|
||||
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
LOG(CONSOLE) << "======== NCCL Statistics========";
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MB communicated: " << allreduce_bytes_/1000000;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -1217,4 +1238,16 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Atomic add function for gradients
|
||||
template <typename OutputGradientT, typename InputGradientT>
|
||||
DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
|
||||
const InputGradientT& gpair) {
|
||||
auto dst_ptr = reinterpret_cast<typename OutputGradientT::ValueT*>(dest);
|
||||
|
||||
atomicAdd(dst_ptr,
|
||||
static_cast<typename OutputGradientT::ValueT>(gpair.GetGrad()));
|
||||
atomicAdd(dst_ptr + 1,
|
||||
static_cast<typename OutputGradientT::ValueT>(gpair.GetHess()));
|
||||
}
|
||||
|
||||
} // namespace dh
|
||||
|
||||
Reference in New Issue
Block a user