implementation of map ranking algorithm on gpu (#5129)

* - implementation of map ranking algorithm
  - also effected necessary suggestions mentioned in the earlier ranking pr's
  - made some performance improvements to the ndcg algo as well
This commit is contained in:
sriramch
2019-12-26 15:05:38 -08:00
committed by Rory Mitchell
parent 9b0af6e882
commit ee81ba8e1f
6 changed files with 714 additions and 266 deletions

View File

@@ -142,31 +142,92 @@ DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, un
atomicOr(&buffer[ibyte / sizeof(unsigned int)], (unsigned int)b << (ibyte % (sizeof(unsigned int)) * 8));
}
/*!
* \brief Find the strict upper bound for an element in a sorted array
* using binary search.
* \param cuts pointer to the first element of the sorted array
* \param n length of the sorted array
* \param v value for which to find the upper bound
* \return the smallest index i such that v < cuts[i], or n if v is greater or equal
* than all elements of the array
*/
template <typename T>
DEV_INLINE int UpperBound(const T* __restrict__ cuts, int n, T v) {
if (n == 0) { return 0; }
if (cuts[n - 1] <= v) { return n; }
if (cuts[0] > v) { return 0; }
namespace internal {
int left = 0, right = n - 1;
while (right - left > 1) {
int middle = left + (right - left) / 2;
if (cuts[middle] > v) {
right = middle;
// Items of size 'n' are sorted in an order determined by the Comparator
// If left is true, find the number of elements where 'comp(item, v)' returns true;
// 0 if nothing is true
// If left is false, find the number of elements where '!comp(item, v)' returns true;
// 0 if nothing is true
template <typename T, typename Comparator = thrust::greater<T>>
XGBOOST_DEVICE __forceinline__ uint32_t
CountNumItemsImpl(bool left, const T * __restrict__ items, uint32_t n, T v,
const Comparator &comp = Comparator()) {
const T *items_begin = items;
uint32_t num_remaining = n;
const T *middle_item = nullptr;
uint32_t middle;
while (num_remaining > 0) {
middle_item = items_begin;
middle = num_remaining / 2;
middle_item += middle;
if ((left && comp(*middle_item, v)) || (!left && !comp(v, *middle_item))) {
items_begin = ++middle_item;
num_remaining -= middle + 1;
} else {
left = middle;
num_remaining = middle;
}
}
return right;
return left ? items_begin - items : items + n - items_begin;
}
}
/*!
* \brief Find the strict upper bound for an element in a sorted array
* using binary search.
* \param items pointer to the first element of the sorted array
* \param n length of the sorted array
* \param v value for which to find the upper bound
* \param comp determines how the items are sorted ascending/descending order - should conform
* to ordering semantics
* \return the smallest index i that has a value > v, or n if none is larger when sorted ascendingly
* or, an index i with a value < v, or 0 if none is smaller when sorted descendingly
*/
// Preserve existing default behavior of upper bound
template <typename T, typename Comp = thrust::less<T>>
XGBOOST_DEVICE __forceinline__ uint32_t UpperBound(const T *__restrict__ items,
uint32_t n,
T v,
const Comp &comp = Comp()) {
if (std::is_same<Comp, thrust::less<T>>::value ||
std::is_same<Comp, thrust::greater<T>>::value) {
return n - internal::CountNumItemsImpl(false, items, n, v, comp);
} else {
static_assert(std::is_same<Comp, thrust::less<T>>::value ||
std::is_same<Comp, thrust::greater<T>>::value,
"Invalid comparator used in Upperbound - can only be thrust::greater/less");
return std::numeric_limits<uint32_t>::max(); // Simply to quiesce the compiler
}
}
/*!
* \brief Find the strict lower bound for an element in a sorted array
* using binary search.
* \param items pointer to the first element of the sorted array
* \param n length of the sorted array
* \param v value for which to find the upper bound
* \param comp determines how the items are sorted ascending/descending order - should conform
* to ordering semantics
* \return the smallest index i that has a value >= v, or n if none is larger
* when sorted ascendingly
* or, an index i with a value <= v, or 0 if none is smaller when sorted descendingly
*/
template <typename T, typename Comp = thrust::less<T>>
XGBOOST_DEVICE __forceinline__ uint32_t LowerBound(const T *__restrict__ items,
uint32_t n,
T v,
const Comp &comp = Comp()) {
if (std::is_same<Comp, thrust::less<T>>::value ||
std::is_same<Comp, thrust::greater<T>>::value) {
return internal::CountNumItemsImpl(true, items, n, v, comp);
} else {
static_assert(std::is_same<Comp, thrust::less<T>>::value ||
std::is_same<Comp, thrust::greater<T>>::value,
"Invalid comparator used in LowerBound - can only be thrust::greater/less");
return std::numeric_limits<uint32_t>::max(); // Simply to quiesce the compiler
}
}
template <typename T>
@@ -510,7 +571,7 @@ void CopyDeviceSpan(xgboost::common::Span<T> dst,
class BulkAllocator {
std::vector<char *> d_ptr_;
std::vector<size_t> size_;
std::vector<int> device_idx_;
int device_idx_{-1};
static const int kAlign = 256;
@@ -593,14 +654,15 @@ class BulkAllocator {
* This frees the GPU memory managed by this allocator.
*/
void Clear() {
for (size_t i = 0; i < d_ptr_.size(); i++) { // NOLINT(modernize-loop-convert)
if (d_ptr_[i] != nullptr) {
safe_cuda(cudaSetDevice(device_idx_[i]));
XGBDeviceAllocator<char> allocator;
allocator.deallocate(thrust::device_ptr<char>(d_ptr_[i]), size_[i]);
d_ptr_[i] = nullptr;
}
}
if (d_ptr_.empty()) return;
safe_cuda(cudaSetDevice(device_idx_));
size_t idx = 0;
std::for_each(d_ptr_.begin(), d_ptr_.end(), [&](char *dptr) {
XGBDeviceAllocator<char>().deallocate(thrust::device_ptr<char>(dptr), size_[idx++]);
});
d_ptr_.clear();
size_.clear();
}
~BulkAllocator() {
@@ -614,6 +676,8 @@ class BulkAllocator {
template <typename... Args>
void Allocate(int device_idx, Args... args) {
if (device_idx_ == -1) device_idx_ = device_idx;
else CHECK(device_idx_ == device_idx);
size_t size = GetSizeBytes(args...);
char *ptr = AllocateDevice(device_idx, size);
@@ -622,7 +686,6 @@ class BulkAllocator {
d_ptr_.push_back(ptr);
size_.push_back(size);
device_idx_.push_back(device_idx);
}
};