GPU binning and compression. (#3319)

* GPU binning and compression.
  - Binning and index compression are done inside the DeviceShard constructor.
  - In case of a DMatrix with multiple row batches, it is first converted into a single row batch.
2018-06-05 07:15:13 +02:00
parent 3f7696ff53
commit 286dccb8e8
10 changed files with 302 additions and 67 deletions
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -8,6 +8,10 @@
 #include <cstddef>
 #include <algorithm>

+#ifdef __CUDACC__
+#include "device_helpers.cuh"
+#endif
+
 namespace xgboost {
 namespace common {

@@ -96,6 +100,23 @@ class CompressedBufferWriter {
      }
    }
  }
+
+#ifdef __CUDACC__
+  // ORs symbol into slot `offset` (symbol_bits_ bits per slot) with one atomic OR per byte touched.
+  __device__ void AtomicWriteSymbol(CompressedByteT* buffer, uint64_t symbol, size_t offset) {
+    const size_t last_bit = (offset + 1) * symbol_bits_ - 1;
+    const ptrdiff_t first_byte = static_cast<ptrdiff_t>(offset * symbol_bits_ / 8);
+    unsigned int* words = reinterpret_cast<unsigned int*>(buffer + detail::kPadding);
+    // Align symbol with the end of its slot; bit index 7 of a byte is that byte's lowest bit.
+    symbol <<= 7 - last_bit % 8;
+    // Emit 8 bits per byte from the slot's last byte back to its first, past the kPadding prefix.
+    for (ptrdiff_t pos = static_cast<ptrdiff_t>(last_bit / 8); pos >= first_byte; --pos) {
+      dh::AtomicOrByte(words, pos, symbol & 0xff);
+      symbol >>= 8;
+    }
+  }
+#endif
+
  template <typename IterT>
  void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
    uint64_t tmp = 0;
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -122,6 +122,14 @@ inline size_t AvailableMemory(int device_idx) {
  return device_free;
 }

+inline size_t TotalMemory(int device_idx) {
+  // Makes device_idx the current CUDA device, then returns the total reported by cudaMemGetInfo.
+  size_t free_bytes = 0, total_bytes = 0;
+  safe_cuda(cudaSetDevice(device_idx));
+  dh::safe_cuda(cudaMemGetInfo(&free_bytes, &total_bytes));
+  return total_bytes;  // the free amount queried alongside is unused
+}
+
 /**
 * \fn  inline int max_shared_memory(int device_idx)
 *
@@ -155,6 +163,12 @@ inline void CheckComputeCapability() {
  }
 }

+
+DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
+  // Atomically ORs b into byte ibyte of buffer; byte k of a word occupies bits 8k..8k+7 of it.
+  atomicOr(buffer + ibyte / sizeof(*buffer), static_cast<unsigned int>(b) << (8 * (ibyte % sizeof(*buffer))));
+}
+
 /*
 * Range iterator
 */