[GPU-Plugin] Major refactor (#2644)

* Removal of redundant code/files. * Removal of exact namespace in GPU plugin * Revert double precision histograms to single precision for performance on Maxwell/Kepler
2017-08-30 10:53:52 +12:00
parent 39adba51c5
commit 19a53814ce
26 changed files with 2170 additions and 5637 deletions
--- a/plugin/updater_gpu/src/updater_gpu.cu
+++ b/plugin/updater_gpu/src/updater_gpu.cu
@@ -1,77 +1,754 @@
 /*!
 * Copyright 2017 XGBoost contributors
 */
-#include "updater_gpu.cuh"
 #include <xgboost/tree_updater.h>
-#include <vector>
 #include <utility>
-#include <string>
-#include "../../../src/common/random.h"
-#include "../../../src/common/sync.h"
+#include <vector>
 #include "../../../src/tree/param.h"
-#include "exact/gpu_builder.cuh"
-#include "gpu_hist_builder.cuh"
+#include "updater_gpu_common.cuh"

 namespace xgboost {
 namespace tree {

-GPUMaker::GPUMaker() : builder(new exact::GPUBuilder<int16_t>()) {}
+DMLC_REGISTRY_FILE_TAG(updater_gpu);

-void GPUMaker::Init(
-    const std::vector<std::pair<std::string, std::string>>& args) {
-  param.InitAllowUnknown(args);
-  builder->Init(param);
+/**
+ * @brief Absolute BFS order IDs to col-wise unique IDs based on user input
+ * @param tid the index of the element that this thread should access
+ * @param abs the array of absolute IDs
+ * @param colIds the array of column IDs for each element
+ * @param nodeStart the start of the node ID at this level
+ * @param nKeys number of nodes at this level.
+ * @return the uniq key
+ */
+
+static HOST_DEV_INLINE node_id_t abs2uniqKey(int tid, const node_id_t* abs,
+                                const int* colIds, node_id_t nodeStart,
+                                int nKeys) {
+  int a = abs[tid];
+  if (a == UNUSED_NODE) return a;
+  return ((a - nodeStart) + (colIds[tid] * nKeys));
 }

-void GPUMaker::Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
-                      const std::vector<RegTree*>& trees) {
-  GradStats::CheckInfo(dmat->info());
-  // rescale learning rate according to size of trees
-  float lr = param.learning_rate;
-  param.learning_rate = lr / trees.size();
-  builder->UpdateParam(param);

-  try {
-    // build tree
-    for (size_t i = 0; i < trees.size(); ++i) {
-      builder->Update(gpair, dmat, trees[i]);
+/**
+ * @struct Pair
+ * @brief Pair used for key basd scan operations on bst_gpair
+ */
+struct Pair {
+  int key;
+  bst_gpair value;
+};
+
+/** define a key that's not used at all in the entire boosting process */
+static const int NONE_KEY = -100;
+
+/**
+ * @brief Allocate temporary buffers needed for scan operations
+ * @param tmpScans gradient buffer
+ * @param tmpKeys keys buffer
+ * @param size number of elements that will be scanned
+ */
+template <int BLKDIM_L1L3 = 256>
+int scanTempBufferSize(int size) {
+  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
+  return nBlks;
+}
+
+struct AddByKey {
+  template <typename T>
+  HOST_DEV_INLINE T operator()(const T& first, const T& second) const {
+    T result;
+    if (first.key == second.key) {
+      result.key = first.key;
+      result.value = first.value + second.value;
+    } else {
+      result.key = second.key;
+      result.value = second.value;
    }
-  } catch (const std::exception& e) {
-    LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
+    return result;
  }
-  param.learning_rate = lr;
+};
+
+/**
+ * @brief Gradient value getter function
+ * @param id the index into the vals or instIds array to which to fetch
+ * @param vals the gradient value buffer
+ * @param instIds instance index buffer
+ * @return the expected gradient value
+ */
+HOST_DEV_INLINE bst_gpair get(int id, const bst_gpair* vals,
+                              const int* instIds) {
+  id = instIds[id];
+  return vals[id];
 }

-GPUHistMaker::GPUHistMaker() : builder(new GPUHistBuilder()) {}
-
-void GPUHistMaker::Init(
-    const std::vector<std::pair<std::string, std::string>>& args) {
-  param.InitAllowUnknown(args);
-  builder->Init(param);
+template <int BLKDIM_L1L3>
+__global__ void cubScanByKeyL1(bst_gpair* scans, const bst_gpair* vals,
+                               const int* instIds, bst_gpair* mScans,
+                               int* mKeys, const node_id_t* keys, int nUniqKeys,
+                               const int* colIds, node_id_t nodeStart,
+                               const int size) {
+  Pair rootPair = {NONE_KEY, bst_gpair(0.f, 0.f)};
+  int myKey;
+  bst_gpair myValue;
+  typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  Pair threadData;
+  int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
+  if (tid < size) {
+    myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+    myValue = get(tid, vals, instIds);
+  } else {
+    myKey = NONE_KEY;
+    myValue = 0.f;
+  }
+  threadData.key = myKey;
+  threadData.value = myValue;
+  // get previous key, especially needed for the last thread in this block
+  // in order to pass on the partial scan values.
+  // this statement MUST appear before the checks below!
+  // else, the result of this shuffle operation will be undefined
+  int previousKey = __shfl_up(myKey, 1);
+  // Collectively compute the block-wide exclusive prefix sum
+  BlockScan(temp_storage)
+      .ExclusiveScan(threadData, threadData, rootPair, AddByKey());
+  if (tid < size) {
+    scans[tid] = threadData.value;
+  } else {
+    return;
+  }
+  if (threadIdx.x == BLKDIM_L1L3 - 1) {
+    threadData.value =
+        (myKey == previousKey) ? threadData.value : bst_gpair(0.0f, 0.0f);
+    mKeys[blockIdx.x] = myKey;
+    mScans[blockIdx.x] = threadData.value + myValue;
+  }
 }

-void GPUHistMaker::Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
-                          const std::vector<RegTree*>& trees) {
-  GradStats::CheckInfo(dmat->info());
-  // rescale learning rate according to size of trees
-  float lr = param.learning_rate;
-  param.learning_rate = lr / trees.size();
-  builder->UpdateParam(param);
-  // build tree
-  try {
-    for (size_t i = 0; i < trees.size(); ++i) {
-      builder->Update(gpair, dmat, trees[i]);
+template <int BLKSIZE>
+__global__ void cubScanByKeyL2(bst_gpair* mScans, int* mKeys, int mLength) {
+  typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+  Pair threadData;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
+    threadData.key = mKeys[i];
+    threadData.value = mScans[i];
+    BlockScan(temp_storage).InclusiveScan(threadData, threadData, AddByKey());
+    mScans[i] = threadData.value;
+    __syncthreads();
+  }
+}
+
+template <int BLKDIM_L1L3>
+__global__ void cubScanByKeyL3(bst_gpair* sums, bst_gpair* scans,
+                               const bst_gpair* vals, const int* instIds,
+                               const bst_gpair* mScans, const int* mKeys,
+                               const node_id_t* keys, int nUniqKeys,
+                               const int* colIds, node_id_t nodeStart,
+                               const int size) {
+  int relId = threadIdx.x;
+  int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
+  // to avoid the following warning from nvcc:
+  //   __shared__ memory variable with non-empty constructor or destructor
+  //     (potential race between threads)
+  __shared__ char gradBuff[sizeof(bst_gpair)];
+  __shared__ int s_mKeys;
+  bst_gpair* s_mScans = reinterpret_cast<bst_gpair*>(gradBuff);
+  if (tid >= size) return;
+  // cache block-wide partial scan info
+  if (relId == 0) {
+    s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : NONE_KEY;
+    s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : bst_gpair();
+  }
+  int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+  int previousKey =
+      tid == 0 ? NONE_KEY
+               : abs2uniqKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
+  bst_gpair myValue = scans[tid];
+  __syncthreads();
+  if (blockIdx.x > 0 && s_mKeys == previousKey) {
+    myValue += s_mScans[0];
+  }
+  if (tid == size - 1) {
+    sums[previousKey] = myValue + get(tid, vals, instIds);
+  }
+  if ((previousKey != myKey) && (previousKey >= 0)) {
+    sums[previousKey] = myValue;
+    myValue = bst_gpair(0.0f, 0.0f);
+  }
+  scans[tid] = myValue;
+}
+
+/**
+ * @brief Performs fused reduce and scan by key functionality. It is assumed
+ * that
+ *  the keys occur contiguously!
+ * @param sums the output gradient reductions for each element performed
+ * key-wise
+ * @param scans the output gradient scans for each element performed key-wise
+ * @param vals the gradients evaluated for each observation.
+ * @param instIds instance ids for each element
+ * @param keys keys to be used to segment the reductions. They need not occur
+ *  contiguously in contrast to scan_by_key. Currently, we need one key per
+ *  value in the 'vals' array.
+ * @param size number of elements in the 'vals' array
+ * @param nUniqKeys max number of uniq keys found per column
+ * @param nCols number of columns
+ * @param tmpScans temporary scan buffer needed for cub-pyramid algo
+ * @param tmpKeys temporary key buffer needed for cub-pyramid algo
+ * @param colIds column indices for each element in the array
+ * @param nodeStart index of the leftmost node in the current level
+ */
+template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
+void reduceScanByKey(bst_gpair* sums, bst_gpair* scans, const bst_gpair* vals,
+                     const int* instIds, const node_id_t* keys, int size,
+                     int nUniqKeys, int nCols, bst_gpair* tmpScans,
+                     int* tmpKeys, const int* colIds, node_id_t nodeStart) {
+  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
+  cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(bst_gpair));
+  cubScanByKeyL1<BLKDIM_L1L3>
+      <<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
+                               nUniqKeys, colIds, nodeStart, size);
+  cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
+  cubScanByKeyL3<BLKDIM_L1L3>
+      <<<nBlks, BLKDIM_L1L3>>>(sums, scans, vals, instIds, tmpScans, tmpKeys,
+                               keys, nUniqKeys, colIds, nodeStart, size);
+}
+
+/**
+ * @struct ExactSplitCandidate
+ * @brief Abstraction of a possible split in the decision tree
+ */
+struct ExactSplitCandidate {
+  /** the optimal gain score for this node */
+  float score;
+  /** index where to split in the DMatrix */
+  int index;
+
+  HOST_DEV_INLINE ExactSplitCandidate() : score(-FLT_MAX), index(INT_MAX) {}
+
+  /**
+   * @brief Whether the split info is valid to be used to create a new child
+   * @param minSplitLoss minimum score above which decision to split is made
+   * @return true if splittable, else false
+   */
+  HOST_DEV_INLINE bool isSplittable(float minSplitLoss) const {
+    return ((score >= minSplitLoss) && (index != INT_MAX));
+  }
+};
+
+/**
+ * @enum ArgMaxByKeyAlgo best_split_evaluation.cuh
+ * @brief Help decide which algorithm to use for multi-argmax operation
+ */
+enum ArgMaxByKeyAlgo {
+  /** simplest, use gmem-atomics for all updates */
+  ABK_GMEM = 0,
+  /** use smem-atomics for updates (when number of keys are less) */
+  ABK_SMEM
+};
+
+/** max depth until which to use shared mem based atomics for argmax */
+static const int MAX_ABK_LEVELS = 3;
+
+HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
+                                             ExactSplitCandidate b) {
+  ExactSplitCandidate out;
+  if (a.score < b.score) {
+    out.score = b.score;
+    out.index = b.index;
+  } else if (a.score == b.score) {
+    out.score = a.score;
+    out.index = (a.index < b.index) ? a.index : b.index;
+  } else {
+    out.score = a.score;
+    out.index = a.index;
+  }
+  return out;
+}
+
+DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
+                             ExactSplitCandidate val) {
+  unsigned long long* intAddress = (unsigned long long*)address;  // NOLINT
+  unsigned long long old = *intAddress;                           // NOLINT
+  unsigned long long assumed;                                     // NOLINT
+  do {
+    assumed = old;
+    ExactSplitCandidate res =
+        maxSplit(val, *reinterpret_cast<ExactSplitCandidate*>(&assumed));
+    old = atomicCAS(intAddress, assumed, *reinterpret_cast<uint64_t*>(&res));
+  } while (assumed != old);
+}
+
+DEV_INLINE void argMaxWithAtomics(
+    int id, ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
+    const bst_gpair* gradSums, const float* vals, const int* colIds,
+    const node_id_t* nodeAssigns, const DeviceDenseNode* nodes, int nUniqKeys,
+    node_id_t nodeStart, int len, const GPUTrainingParam& param) {
+  int nodeId = nodeAssigns[id];
+  // @todo: this is really a bad check! but will be fixed when we move
+  //  to key-based reduction
+  if ((id == 0) ||
+      !((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
+        (vals[id] == vals[id - 1]))) {
+    if (nodeId != UNUSED_NODE) {
+      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
+      bst_gpair colSum = gradSums[sumId];
+      int uid = nodeId - nodeStart;
+      DeviceDenseNode n = nodes[nodeId];
+      bst_gpair parentSum = n.sum_gradients;
+      float parentGain = n.root_gain;
+      bool tmp;
+      ExactSplitCandidate s;
+      bst_gpair missing = parentSum - colSum;
+      s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain,
+                                 param, tmp);
+      s.index = id;
+      atomicArgMax(nodeSplits + uid, s);
+    }  // end if nodeId != UNUSED_NODE
+  }    // end if id == 0 ...
+}
+
+__global__ void atomicArgMaxByKeyGmem(
+    ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
+    const bst_gpair* gradSums, const float* vals, const int* colIds,
+    const node_id_t* nodeAssigns, const DeviceDenseNode* nodes, int nUniqKeys,
+    node_id_t nodeStart, int len, const TrainParam param) {
+  int id = threadIdx.x + (blockIdx.x * blockDim.x);
+  const int stride = blockDim.x * gridDim.x;
+  for (; id < len; id += stride) {
+    argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
+                      nodeAssigns, nodes, nUniqKeys, nodeStart, len,
+                      GPUTrainingParam(param));
+  }
+}
+
+__global__ void atomicArgMaxByKeySmem(
+    ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
+    const bst_gpair* gradSums, const float* vals, const int* colIds,
+    const node_id_t* nodeAssigns, const DeviceDenseNode* nodes, int nUniqKeys,
+    node_id_t nodeStart, int len, const TrainParam param) {
+  extern __shared__ char sArr[];
+  ExactSplitCandidate* sNodeSplits =
+      reinterpret_cast<ExactSplitCandidate*>(sArr);
+  int tid = threadIdx.x;
+  ExactSplitCandidate defVal;
+#pragma unroll 1
+  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
+    sNodeSplits[i] = defVal;
+  }
+  __syncthreads();
+  int id = tid + (blockIdx.x * blockDim.x);
+  const int stride = blockDim.x * gridDim.x;
+  for (; id < len; id += stride) {
+    argMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
+                      nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
+  }
+  __syncthreads();
+  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
+    ExactSplitCandidate s = sNodeSplits[i];
+    atomicArgMax(nodeSplits + i, s);
+  }
+}
+
+/**
+ * @brief Performs argmax_by_key functionality but for cases when keys need not
+ *  occur contiguously
+ * @param nodeSplits will contain information on best split for each node
+ * @param gradScans exclusive sum on sorted segments for each col
+ * @param gradSums gradient sum for each column in DMatrix based on to node-ids
+ * @param vals feature values
+ * @param colIds column index for each element in the feature values array
+ * @param nodeAssigns node-id assignments to each element in DMatrix
+ * @param nodes pointer to all nodes for this tree in BFS order
+ * @param nUniqKeys number of unique node-ids in this level
+ * @param nodeStart start index of the node-ids in this level
+ * @param len number of elements
+ * @param param training parameters
+ * @param algo which algorithm to use for argmax_by_key
+ */
+template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
+void argMaxByKey(ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
+                 const bst_gpair* gradSums, const float* vals,
+                 const int* colIds, const node_id_t* nodeAssigns,
+                 const DeviceDenseNode* nodes, int nUniqKeys,
+                 node_id_t nodeStart, int len, const TrainParam param,
+                 ArgMaxByKeyAlgo algo) {
+  dh::fillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
+      dh::get_device_idx(param.gpu_id), nodeSplits, nUniqKeys,
+      ExactSplitCandidate());
+  int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD * BLKDIM);
+  switch (algo) {
+    case ABK_GMEM:
+      atomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
+          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+          nUniqKeys, nodeStart, len, param);
+      break;
+    case ABK_SMEM:
+      atomicArgMaxByKeySmem<<<nBlks, BLKDIM,
+                              sizeof(ExactSplitCandidate) * nUniqKeys>>>(
+          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+          nUniqKeys, nodeStart, len, param);
+      break;
+    default:
+      throw std::runtime_error("argMaxByKey: Bad algo passed!");
+  }
+}
+
+__global__ void assignColIds(int* colIds, const int* colOffsets) {
+  int myId = blockIdx.x;
+  int start = colOffsets[myId];
+  int end = colOffsets[myId + 1];
+  for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
+    colIds[id] = myId;
+  }
+}
+
+__global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
+                                   const DeviceDenseNode* nodes, int nRows) {
+  int id = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (id >= nRows) {
+    return;
+  }
+  // if this element belongs to none of the currently active node-id's
+  node_id_t nId = nodeIdsPerInst[id];
+  if (nId == UNUSED_NODE) {
+    return;
+  }
+  const DeviceDenseNode n = nodes[nId];
+  node_id_t result;
+  if (n.IsLeaf() || n.IsUnused()) {
+    result = UNUSED_NODE;
+  } else if (n.dir == LeftDir) {
+    result = (2 * n.idx) + 1;
+  } else {
+    result = (2 * n.idx) + 2;
+  }
+  nodeIdsPerInst[id] = result;
+}
+
+__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
+                              const node_id_t* nodeIds, const int* instId,
+                              const DeviceDenseNode* nodes,
+                              const int* colOffsets, const float* vals,
+                              int nVals, int nCols) {
+  int id = threadIdx.x + (blockIdx.x * blockDim.x);
+  const int stride = blockDim.x * gridDim.x;
+  for (; id < nVals; id += stride) {
+    // fusing generation of indices for node locations
+    nodeLocations[id] = id;
+    // using nodeIds here since the previous kernel would have updated
+    // the nodeIdsPerInst with all default assignments
+    int nId = nodeIds[id];
+    // if this element belongs to none of the currently active node-id's
+    if (nId != UNUSED_NODE) {
+      const DeviceDenseNode n = nodes[nId];
+      int colId = n.fidx;
+      // printf("nid=%d colId=%d id=%d\n", nId, colId, id);
+      int start = colOffsets[colId];
+      int end = colOffsets[colId + 1];
+      // @todo: too much wasteful threads!!
+      if ((id >= start) && (id < end) && !(n.IsLeaf() || n.IsUnused())) {
+        node_id_t result = (2 * n.idx) + 1 + (vals[id] >= n.fvalue);
+        nodeIdsPerInst[instId[id]] = result;
+      }
    }
-  } catch (const std::exception& e) {
-    LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
  }
-  param.learning_rate = lr;
 }

-bool GPUHistMaker::UpdatePredictionCache(const DMatrix* data,
-                                         std::vector<bst_float>* out_preds) {
-  return builder->UpdatePredictionCache(data, out_preds);
+__global__ void markLeavesKernel(DeviceDenseNode* nodes, int len) {
+  int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if ((id < len) && !nodes[id].IsUnused()) {
+    int lid = (id << 1) + 1;
+    int rid = (id << 1) + 2;
+    if ((lid >= len) || (rid >= len)) {
+      nodes[id].root_gain = -FLT_MAX;  // bottom-most nodes
+    } else if (nodes[lid].IsUnused() && nodes[rid].IsUnused()) {
+      nodes[id].root_gain = -FLT_MAX;  // unused child nodes
+    }
+  }
 }

+class GPUMaker : public TreeUpdater {
+ protected:
+  TrainParam param;
+  /** whether we have initialized memory already (so as not to repeat!) */
+  bool allocated;
+  /** feature values stored in column-major compressed format */
+  dh::dvec2<float> vals;
+  dh::dvec<float> vals_cached;
+  /** corresponding instance id's of these featutre values */
+  dh::dvec2<int> instIds;
+  dh::dvec<int> instIds_cached;
+  /** column offsets for these feature values */
+  dh::dvec<int> colOffsets;
+  dh::dvec<bst_gpair> gradsInst;
+  dh::dvec2<node_id_t> nodeAssigns;
+  dh::dvec2<int> nodeLocations;
+  dh::dvec<DeviceDenseNode> nodes;
+  dh::dvec<node_id_t> nodeAssignsPerInst;
+  dh::dvec<bst_gpair> gradSums;
+  dh::dvec<bst_gpair> gradScans;
+  dh::dvec<ExactSplitCandidate> nodeSplits;
+  int nVals;
+  int nRows;
+  int nCols;
+  int maxNodes;
+  int maxLeaves;
+  dh::CubMemory tmp_mem;
+  dh::dvec<bst_gpair> tmpScanGradBuff;
+  dh::dvec<int> tmpScanKeyBuff;
+  dh::dvec<int> colIds;
+  dh::bulk_allocator<dh::memory_type::DEVICE> ba;
+
+ public:
+  GPUMaker() : allocated(false) {}
+  ~GPUMaker() {}
+
+  void Init(
+      const std::vector<std::pair<std::string, std::string>>& args) override {
+    param.InitAllowUnknown(args);
+    maxNodes = (1 << (param.max_depth + 1)) - 1;
+    maxLeaves = 1 << param.max_depth;
+  }
+
+  void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
+              const std::vector<RegTree*>& trees) override {
+    GradStats::CheckInfo(dmat->info());
+    // rescale learning rate according to size of trees
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+
+    try {
+      // build tree
+      for (size_t i = 0; i < trees.size(); ++i) {
+        UpdateTree(gpair, dmat, trees[i]);
+      }
+    } catch (const std::exception& e) {
+      LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
+    }
+    param.learning_rate = lr;
+  }
+  /// @note: Update should be only after Init!!
+  void UpdateTree(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
+                  RegTree* hTree) {
+    if (!allocated) {
+      setupOneTimeData(dmat);
+    }
+    for (int i = 0; i < param.max_depth; ++i) {
+      if (i == 0) {
+        // make sure to start on a fresh tree with sorted values!
+        vals.current_dvec() = vals_cached;
+        instIds.current_dvec() = instIds_cached;
+        transferGrads(gpair);
+      }
+      int nNodes = 1 << i;
+      node_id_t nodeStart = nNodes - 1;
+      initNodeData(i, nodeStart, nNodes);
+      findSplit(i, nodeStart, nNodes);
+    }
+    // mark all the used nodes with unused children as leaf nodes
+    markLeaves();
+    dense2sparse_tree(hTree, nodes, param);
+  }
+
+  void split2node(int nNodes, node_id_t nodeStart) {
+    auto d_nodes = nodes.data();
+    auto d_gradScans = gradScans.data();
+    auto d_gradSums = gradSums.data();
+    auto d_nodeAssigns = nodeAssigns.current();
+    auto d_colIds = colIds.data();
+    auto d_vals = vals.current();
+    auto d_nodeSplits = nodeSplits.data();
+    int nUniqKeys = nNodes;
+    float min_split_loss = param.min_split_loss;
+    auto gpu_param = GPUTrainingParam(param);
+
+    dh::launch_n(param.gpu_id, nNodes, [=] __device__(int uid) {
+      int absNodeId = uid + nodeStart;
+      ExactSplitCandidate s = d_nodeSplits[uid];
+      if (s.isSplittable(min_split_loss)) {
+        int idx = s.index;
+        int nodeInstId =
+            abs2uniqKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
+        bool missingLeft = true;
+        const DeviceDenseNode& n = d_nodes[absNodeId];
+        bst_gpair gradScan = d_gradScans[idx];
+        bst_gpair gradSum = d_gradSums[nodeInstId];
+        float thresh = d_vals[idx];
+        int colId = d_colIds[idx];
+        // get the default direction for the current node
+        bst_gpair missing = n.sum_gradients - gradSum;
+        loss_chg_missing(gradScan, missing, n.sum_gradients, n.root_gain,
+                         gpu_param, missingLeft);
+        // get the score/weight/id/gradSum for left and right child nodes
+        bst_gpair lGradSum = missingLeft ? gradScan + missing : gradScan;
+        bst_gpair rGradSum = n.sum_gradients - lGradSum;
+
+        // Create children
+        d_nodes[left_child_nidx(absNodeId)] =
+            DeviceDenseNode(lGradSum, left_child_nidx(absNodeId), gpu_param);
+        d_nodes[right_child_nidx(absNodeId)] =
+            DeviceDenseNode(rGradSum, right_child_nidx(absNodeId), gpu_param);
+        // Set split for parent
+        d_nodes[absNodeId].SetSplit(thresh, colId,
+                                    missingLeft ? LeftDir : RightDir);
+      } else {
+        // cannot be split further, so this node is a leaf!
+        d_nodes[absNodeId].root_gain = -FLT_MAX;
+      }
+    });
+  }
+
+  void findSplit(int level, node_id_t nodeStart, int nNodes) {
+    reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
+                    instIds.current(), nodeAssigns.current(), nVals, nNodes,
+                    nCols, tmpScanGradBuff.data(), tmpScanKeyBuff.data(),
+                    colIds.data(), nodeStart);
+    argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
+                vals.current(), colIds.data(), nodeAssigns.current(),
+                nodes.data(), nNodes, nodeStart, nVals, param,
+                level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
+    split2node(nNodes, nodeStart);
+  }
+
+  void allocateAllData(int offsetSize) {
+    int tmpBuffSize = scanTempBufferSize(nVals);
+    ba.allocate(dh::get_device_idx(param.gpu_id), param.silent, &vals, nVals,
+                &vals_cached, nVals, &instIds, nVals, &instIds_cached, nVals,
+                &colOffsets, offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
+                &nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
+                nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals,
+                &nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize,
+                &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
+  }
+
+  void setupOneTimeData(DMatrix* dmat) {
+    size_t free_memory = dh::available_memory(dh::get_device_idx(param.gpu_id));
+    if (!dmat->SingleColBlock()) {
+      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
+    }
+    std::vector<float> fval;
+    std::vector<int> fId, offset;
+    convertToCsc(dmat, &fval, &fId, &offset);
+    allocateAllData(static_cast<int>(offset.size()));
+    transferAndSortData(fval, fId, offset);
+    allocated = true;
+  }
+
+  void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
+                    std::vector<int>* fId, std::vector<int>* offset) {
+    MetaInfo info = dmat->info();
+    nRows = info.num_row;
+    nCols = info.num_col;
+    offset->reserve(nCols + 1);
+    offset->push_back(0);
+    fval->reserve(nCols * nRows);
+    fId->reserve(nCols * nRows);
+    // in case you end up with a DMatrix having no column access
+    // then make sure to enable that before copying the data!
+    if (!dmat->HaveColAccess()) {
+      const std::vector<bool> enable(nCols, true);
+      dmat->InitColAccess(enable, 1, nRows);
+    }
+    dmlc::DataIter<ColBatch>* iter = dmat->ColIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const ColBatch& batch = iter->Value();
+      for (int i = 0; i < batch.size; i++) {
+        const ColBatch::Inst& col = batch[i];
+        for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
+             it++) {
+          int inst_id = static_cast<int>(it->index);
+          fval->push_back(it->fvalue);
+          fId->push_back(inst_id);
+        }
+        offset->push_back(fval->size());
+      }
+    }
+    nVals = fval->size();
+  }
+
+  void transferAndSortData(const std::vector<float>& fval,
+                           const std::vector<int>& fId,
+                           const std::vector<int>& offset) {
+    vals.current_dvec() = fval;
+    instIds.current_dvec() = fId;
+    colOffsets = offset;
+    dh::segmentedSort<float, int>(&tmp_mem, &vals, &instIds, nVals, nCols,
+                              colOffsets);
+    vals_cached = vals.current_dvec();
+    instIds_cached = instIds.current_dvec();
+    assignColIds<<<nCols, 512>>>(colIds.data(), colOffsets.data());
+  }
+
+  void transferGrads(const std::vector<bst_gpair>& gpair) {
+    // HACK
+    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
+                             sizeof(bst_gpair) * nRows,
+                             cudaMemcpyHostToDevice));
+    // evaluate the full-grad reduction for the root node
+    dh::sumReduction<bst_gpair>(tmp_mem, gradsInst, gradSums, nRows);
+  }
+
+  void initNodeData(int level, node_id_t nodeStart, int nNodes) {
+    // all instances belong to root node at the beginning!
+    if (level == 0) {
+      nodes.fill(DeviceDenseNode());
+      nodeAssigns.current_dvec().fill(0);
+      nodeAssignsPerInst.fill(0);
+      // for root node, just update the gradient/score/weight/id info
+      // before splitting it! Currently all data is on GPU, hence this
+      // stupid little kernel
+      auto d_nodes = nodes.data();
+      auto d_sums = gradSums.data();
+      auto gpu_params = GPUTrainingParam(param);
+      dh::launch_n(param.gpu_id, 1, [=] __device__(int idx) {
+        d_nodes[0] = DeviceDenseNode(d_sums[0], 0, gpu_params);
+      });
+    } else {
+      const int BlkDim = 256;
+      const int ItemsPerThread = 4;
+      // assign default node ids first
+      int nBlks = dh::div_round_up(nRows, BlkDim);
+      fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.data(),
+                                            nodes.data(), nRows);
+      // evaluate the correct child indices of non-missing values next
+      nBlks = dh::div_round_up(nVals, BlkDim * ItemsPerThread);
+      assignNodeIds<<<nBlks, BlkDim>>>(
+          nodeAssignsPerInst.data(), nodeLocations.current(),
+          nodeAssigns.current(), instIds.current(), nodes.data(),
+          colOffsets.data(), vals.current(), nVals, nCols);
+      // gather the node assignments across all other columns too
+      dh::gather(dh::get_device_idx(param.gpu_id), nodeAssigns.current(),
+             nodeAssignsPerInst.data(), instIds.current(), nVals);
+      sortKeys(level);
+    }
+  }
+
+  void sortKeys(int level) {
+    // segmented-sort the arrays based on node-id's
+    // but we don't need more than level+1 bits for sorting!
+    segmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,
+                  colOffsets, 0, level + 1);
+    dh::gather<float, int>(dh::get_device_idx(param.gpu_id), vals.other(),
+                       vals.current(), instIds.other(), instIds.current(),
+                       nodeLocations.current(), nVals);
+    vals.buff().selector ^= 1;
+    instIds.buff().selector ^= 1;
+  }
+
+  void markLeaves() {
+    const int BlkDim = 128;
+    int nBlks = dh::div_round_up(maxNodes, BlkDim);
+    markLeavesKernel<<<nBlks, BlkDim>>>(nodes.data(), maxNodes);
+  }
+};
+
+XGBOOST_REGISTER_TREE_UPDATER(GPUMaker, "grow_gpu")
+    .describe("Grow tree with GPU.")
+    .set_body([]() { return new GPUMaker(); });
+
 }  // namespace tree
 }  // namespace xgboost