Pairwise ranking objective implementation on GPU (#4873)

* - Pairwise ranking objective implementation on GPU.
   - There are a couple more algorithms (NDCG and MAP) for which support will be added
     in follow-up PRs.
   - With no label groups defined, GetGradient is ~90x faster on GPU (120M-instance
     mortgage dataset).
   - It can still be an order of magnitude faster with ~10 groups (and adequate cores
     for the CPU implementation).

* Add JSON config to the rank objective.
sriramch
2019-10-22 20:40:07 -07:00
committed by Jiaming Yuan
parent 5620322a48
commit 310fe60b35
8 changed files with 776 additions and 412 deletions


@@ -32,6 +32,27 @@
#include "../common/io.h"
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else // In device code and CUDA < 600
XGBOOST_DEVICE __forceinline__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull =
      (unsigned long long int*)address;                   // NOLINT
  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
    // NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
namespace dh {
#define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
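For context, this fallback is needed because devices with compute capability below 6.0 have no native double-precision atomicAdd. A minimal toy kernel that would exercise it (illustrative only, not part of this change; SumDoubles is a hypothetical name):

    // Sums an array of doubles with atomicAdd. On sm < 60 the call resolves to the
    // CAS-based fallback defined above; on newer devices it uses the hardware
    // double-precision atomicAdd.
    __global__ void SumDoubles(const double* in, int n, double* out) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        atomicAdd(out, in[i]);
      }
    }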
@@ -129,7 +150,8 @@ DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, un
 * \return the smallest index i such that v < cuts[i], or n if v is greater than
 *         or equal to all elements of the array
 */
DEV_INLINE int UpperBound(const float* __restrict__ cuts, int n, float v) {
template <typename T>
DEV_INLINE int UpperBound(const T* __restrict__ cuts, int n, T v) {
  if (n == 0) { return 0; }
  if (cuts[n - 1] <= v) { return n; }
  if (cuts[0] > v) { return 0; }
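As a reference for the contract documented above, here is a host-side sketch with the same semantics (illustrative only; UpperBoundHost is a hypothetical name, not part of this change):

    // Returns the smallest i such that v < cuts[i], or n if v is greater than or
    // equal to every element of the ascending-sorted array.
    template <typename T>
    int UpperBoundHost(const T* cuts, int n, T v) {
      int left = 0, right = n;
      while (left < right) {
        int mid = left + (right - left) / 2;
        if (cuts[mid] <= v) {
          left = mid + 1;  // v is not below cuts[mid]; discard the lower half
        } else {
          right = mid;     // cuts[mid] > v; mid is a candidate answer
        }
      }
      return left;
    }
    // Example: cuts = {1, 3, 5}, v = 3 -> 2, since cuts[2] = 5 is the first cut > v.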
@@ -235,7 +257,6 @@ class MemoryLogger {
    }
    num_deallocations++;
    CHECK_LE(num_deallocations, num_allocations);
    CHECK_EQ(itr->second, n);
    currently_allocated_bytes -= itr->second;
    device_allocations.erase(itr);
  }
@@ -269,7 +290,7 @@ public:
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
<< " ========";
LOG(CONSOLE) << "Peak memory usage: "
<< stats_.peak_allocated_bytes / 1000000 << "mb";
<< stats_.peak_allocated_bytes / 1048576 << "MiB";
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
}
};
@@ -317,8 +338,8 @@ struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
};
cub::CachingDeviceAllocator& GetGlobalCachingAllocator ()
{
  // Configure allocator with maximum cached bin size of ~1GB and no limit on
  // maximum cached bytes
  static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
  return *allocator;
}
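Assuming cub's (bin_growth, min_bin, max_bin) constructor parameter order, the arguments (2, 9, 29) give cached bins growing geometrically from 2^9 = 512 bytes up to 2^29 bytes (512 MiB), so the "~1GB" in the comment is a rough estimate. A small illustrative computation:

    // Illustrative only: bin sizes implied by CachingDeviceAllocator(2, 9, 29),
    // assuming the (bin_growth, min_bin, max_bin) constructor parameter order.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned bin_growth = 2, min_bin = 9, max_bin = 29;
      std::uint64_t smallest = 1, largest = 1;
      for (unsigned i = 0; i < min_bin; ++i) smallest *= bin_growth;  // 2^9 bytes
      for (unsigned i = 0; i < max_bin; ++i) largest  *= bin_growth;  // 2^29 bytes
      std::printf("smallest bin: %llu B, largest bin: %llu MiB\n",
                  static_cast<unsigned long long>(smallest),
                  static_cast<unsigned long long>(largest / (1024 * 1024)));
      return 0;
    }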
@@ -977,7 +998,7 @@ class AllReducer {
    if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
      LOG(CONSOLE) << "======== NCCL Statistics========";
      LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
      LOG(CONSOLE) << "AllReduce total MB communicated: " << allreduce_bytes_/1000000;
      LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
    }
#endif
}
@@ -1217,4 +1238,16 @@ public:
}
};
// Atomic add function for gradients
template <typename OutputGradientT, typename InputGradientT>
DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
                               const InputGradientT& gpair) {
  auto dst_ptr = reinterpret_cast<typename OutputGradientT::ValueT*>(dest);
  atomicAdd(dst_ptr,
            static_cast<typename OutputGradientT::ValueT>(gpair.GetGrad()));
  atomicAdd(dst_ptr + 1,
            static_cast<typename OutputGradientT::ValueT>(gpair.GetHess()));
}
} // namespace dh
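A helper like AtomicAddGpair is what allows many document pairs to update the same instance's gradient concurrently in a ranking objective. A minimal, hypothetical kernel sketch of that pattern (names and data layout are illustrative assumptions, not this PR's actual implementation):

    // Each thread handles one (positive, negative) document pair within a query
    // group and accumulates its pairwise gradient into per-instance buffers.
    // Assumes xgboost::GradientPair and the dh::AtomicAddGpair helper above.
    __global__ void AccumulatePairGradients(const int* pos_idx, const int* neg_idx,
                                            int n_pairs,
                                            const float* pair_grad,
                                            const float* pair_hess,
                                            xgboost::GradientPair* out_gpair) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= n_pairs) { return; }
      // The positive document receives +g, the negative document -g; the hessian
      // contribution is positive for both.
      dh::AtomicAddGpair(&out_gpair[pos_idx[i]],
                         xgboost::GradientPair(pair_grad[i], pair_hess[i]));
      dh::AtomicAddGpair(&out_gpair[neg_idx[i]],
                         xgboost::GradientPair(-pair_grad[i], pair_hess[i]));
    }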