Added finding quantiles on GPU. (#3393)

* Added finding quantiles on GPU. - this includes datasets where weights are assigned to data rows - as the quantiles found by the new algorithm are not the same as those found by the old one, test thresholds in tests/python-gpu/test_gpu_updaters.py have been adjusted. * Adjustments and improved testing for finding quantiles on the GPU. - added C++ tests for the DeviceSketch() function - reduced one of the thresholds in test_gpu_updaters.py - adjusted the cuts found by the find_cuts_k kernel
2018-07-27 04:03:16 +02:00
parent e2f09db77a
commit cc6a5a3666
14 changed files with 691 additions and 116 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -163,11 +163,41 @@ inline void CheckComputeCapability() {
  }
 }

-
+  
 DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
  atomicOr(&buffer[ibyte / sizeof(unsigned int)], (unsigned int)b << (ibyte % (sizeof(unsigned int)) * 8));
 }

+/*!
+ * \brief Find the strict upper bound for an element in a sorted array
+ *  using binary search.
+ * \param cuts pointer to the first element of the sorted (ascending) array
+ * \param n length of the sorted array; for an empty array (n == 0) the result is 0
+ * \param v value for which to find the upper bound
+ * \return the smallest index i such that v < cuts[i], or n if v is greater than
+ *  or equal to every element of the array
+*/
+DEV_INLINE int UpperBound(const float* __restrict__ cuts, int n, float v) {
+  if (n == 0) {  // empty array: nothing to search
+    return 0;
+	}
+  if (cuts[n - 1] <= v) {  // v is not below the last element, so no index qualifies
+    return n;
+	}
+  if (cuts[0] > v) {  // v is below the first element already
+    return 0;
+	}
+  int left = 0, right = n - 1;  // after the checks above: cuts[left] <= v < cuts[right]
+  while (right - left > 1) {  // shrink the bracket until its ends are adjacent
+    int middle = left + (right - left) / 2;
+    if (cuts[middle] > v) {
+      right = middle;  // middle is still strictly above v: keep it as the upper end
+    } else {
+      left = middle;  // cuts[middle] <= v: the answer lies to the right of middle
+		}
+  }
+  return right;  // smallest index whose element is strictly greater than v
+}

 /*
 * Range iterator
@@ -252,6 +282,18 @@ T1 DivRoundUp(const T1 a, const T2 b) {
  return static_cast<T1>(ceil(static_cast<double>(a) / b));
 }

+inline void RowSegments(size_t n_rows, size_t n_devices, std::vector<size_t>* segments) {
+  segments->push_back(0);  // first boundary; entries are appended, *segments is not cleared
+  size_t row_begin = 0;  // start row of the segment currently being emitted
+  size_t shard_size = DivRoundUp(n_rows, n_devices);  // rows per segment, rounded up
+  for (size_t d_idx = 0; d_idx < n_devices; ++d_idx) {  // appends one end row per device
+    size_t row_end = std::min(row_begin + shard_size, n_rows);  // clamped to n_rows
+    segments->push_back(row_end);
+    row_begin = row_end;  // the next segment starts where this one ended
+  }
+}
+
+
 template <typename L>
 __global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
  for (auto i : GridStrideRange(begin, end)) {