[GPU-Plugin] Integration of a faster version of grow_gpu plugin into mainstream (#2360)

* Integrating a faster version of grow_gpu plugin 1. Removed the older files to reduce duplication 2. Moved all of the grow_gpu files under 'exact' folder 3. All of them are inside 'exact' namespace to avoid any conflicts 4. Fixed a bug in benchmark.py while running only 'grow_gpu' plugin 5. Added cub and googletest submodules to ease integration and unit-testing 6. Updates to CMakeLists.txt to directly build cuda objects into libxgboost * Added support for building gpu plugins through make flow 1. updated makefile and config.mk to add right targets 2. added unit-tests for gpu exact plugin code * 1. Added support for building gpu plugin using 'make' flow as well 2. Updated instructions for building and testing gpu plugin * Fix travis-ci errors for PR#2360 1. lint errors on unit-tests 2. removed googletest, instead depended upon dmlc-core provide gtest cache * Some more fixes to travis-ci lint failures PR#2360 * Added Rory's copyrights to the files containing code from both. * updated copyright statement as per Rory's request * moved the static datasets into a script to generate them at runtime * 1. memory usage print when silent=0 2. tests/ and test/ folder organization 3. removal of the dependency of googletest for just building xgboost 4. coding style updates for .cuh as well * Fixes for compilation warnings * add cuda object files as well when JVM_BINDINGS=ON
2017-06-06 03:09:53 +05:30
parent 2d9052bc7d
commit 85b2fb3eee
37 changed files with 4118 additions and 1601 deletions
--- a/plugin/updater_gpu/src/exact/argmax_by_key.cuh
+++ b/plugin/updater_gpu/src/exact/argmax_by_key.cuh
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../../../../src/tree/param.h"
+#include "../common.cuh"
+#include "node.cuh"
+#include "loss_functions.cuh"
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @enum ArgMaxByKeyAlgo argmax_by_key.cuh
+ * @brief Help decide which algorithm to use for multi-argmax operation
+ */
+enum ArgMaxByKeyAlgo {
+  /** simplest, use gmem-atomics for all updates */
+  ABK_GMEM = 0,
+  /** use smem-atomics for updates (when the number of keys is small) */
+  ABK_SMEM
+};
+
+/** maximum tree depth (level) up to which shared-memory based atomics are
+ *  used for argmax */
+static const int MAX_ABK_LEVELS = 3;
+
+/**
+ * @brief Picks the better of two split candidates
+ * @param a first candidate
+ * @param b second candidate
+ * @return a split holding the higher score; when both scores compare equal,
+ *  the smaller of the two indices is kept. If neither '<' nor '==' holds,
+ *  'a' is returned.
+ */
+HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
+  const bool bIsBetter = (a.score < b.score);
+  const bool isTie = !bIsBetter && (a.score == b.score);
+  Split out;
+  out.score = bIsBetter ? b.score : a.score;
+  if (bIsBetter) {
+    out.index = b.index;
+  } else if (isTie && !(a.index < b.index)) {
+    // equal scores: prefer the smaller index
+    out.index = b.index;
+  } else {
+    out.index = a.index;
+  }
+  return out;
+}
+
+/**
+ * @brief Atomically replaces the split stored at 'address' with the result of
+ *  maxSplit(val, stored split)
+ * @param address location of the split to be updated
+ * @param val candidate split
+ * @note the Split is reinterpreted as a single 64-bit word so that it can be
+ *  swapped with one atomicCAS
+ */
+DEV_INLINE void atomicArgMax(Split* address, Split val) {
+  unsigned long long* slot = (unsigned long long*) address;
+  unsigned long long observed = *slot;
+  for (;;) {
+    unsigned long long expected = observed;
+    Split best = maxSplit(val, *(Split*)&expected);
+    observed = atomicCAS(slot, expected, *(unsigned long long*)&best);
+    // the swap took place only if the slot still held the expected value
+    if (observed == expected) {
+      break;
+    }
+  }
+}
+
+/**
+ * @brief Scores the split candidate at element 'id' and merges it into the
+ *  best split of its node through atomicArgMax
+ * @param id index of the element to be evaluated
+ * @param nodeSplits best split per node (indexed by nodeId - nodeStart)
+ * @param gradScans gradient scan value for each element
+ * @param gradSums gradient sums, indexed by the key from abs2uniqKey
+ * @param vals feature values
+ * @param colIds column index for each element
+ * @param nodeAssigns node-id assigned to each element
+ * @param nodes all the nodes of the tree
+ * @param nUniqKeys number of unique node-ids in this level
+ * @param nodeStart first node-id of this level
+ * @param len number of elements (not used here)
+ * @param param training parameters
+ */
+template <typename node_id_t>
+DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
+                                  const gpu_gpair* gradScans,
+                                  const gpu_gpair* gradSums, const float* vals,
+                                  const int* colIds,
+                                  const node_id_t* nodeAssigns,
+                                  const Node<node_id_t>* nodes, int nUniqKeys,
+                                  node_id_t nodeStart, int len,
+                                  const TrainParam &param) {
+  int nodeId = nodeAssigns[id];
+  ///@todo: this is really a bad check! but will be fixed when we move
+  ///   to key-based reduction
+  // an element that repeats the node, column and value of its predecessor
+  // is not evaluated again
+  const bool repeatsPrevious = (id != 0) &&
+                               (nodeId == nodeAssigns[id-1]) &&
+                               (colIds[id] == colIds[id-1]) &&
+                               (vals[id] == vals[id-1]);
+  if (repeatsPrevious) {
+    return;
+  }
+  if (nodeId == UNUSED_NODE) {
+    return;
+  }
+  int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart,
+                          nUniqKeys);
+  Node<node_id_t> parent = nodes[nodeId];
+  gpu_gpair parentSum = parent.gradSum;
+  float parentGain = parent.score;
+  // whatever the column did not account for is attributed to missing values
+  gpu_gpair missing = parentSum - gradSums[sumId];
+  bool unusedFlag;
+  Split candidate;
+  candidate.score = loss_chg_missing(gradScans[id], missing, parentSum,
+                                     parentGain, param, unusedFlag);
+  candidate.index = id;
+  atomicArgMax(nodeSplits + (nodeId - nodeStart), candidate);
+}
+
+/**
+ * @brief Kernel evaluating every element's split candidate and updating the
+ *  per-node best splits directly in 'nodeSplits'
+ * @note each thread starts at its global thread index and advances by the
+ *  total number of launched threads until 'len' is reached
+ */
+template <typename node_id_t>
+__global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
+                                      const gpu_gpair* gradScans,
+                                      const gpu_gpair* gradSums,
+                                      const float* vals, const int* colIds,
+                                      const node_id_t* nodeAssigns,
+                                      const Node<node_id_t>* nodes, int nUniqKeys,
+                                      node_id_t nodeStart, int len,
+                                      const TrainParam param) {
+  const int stride = blockDim.x * gridDim.x;
+  int id = threadIdx.x + (blockIdx.x * blockDim.x);
+  while (id < len) {
+    argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
+                      nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
+    id += stride;
+  }
+}
+
+/**
+ * @brief Kernel evaluating every element's split candidate; the per-node best
+ *  splits are first accumulated in dynamic shared memory and then merged into
+ *  'nodeSplits'
+ * @note needs sizeof(Split)*nUniqKeys bytes of dynamic shared memory
+ */
+template <typename node_id_t>
+__global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
+                                      const gpu_gpair* gradScans,
+                                      const gpu_gpair* gradSums,
+                                      const float* vals, const int* colIds,
+                                      const node_id_t* nodeAssigns,
+                                      const Node<node_id_t>* nodes, int nUniqKeys,
+                                      node_id_t nodeStart, int len,
+                                      const TrainParam param) {
+  extern __shared__ char sArr[];
+  Split* blockBest = (Split*)sArr;
+  const int lane = threadIdx.x;
+  // reset the block-local results to a default-constructed Split
+  Split initial;
+  #pragma unroll 1
+  for (int k = lane; k < nUniqKeys; k += blockDim.x) {
+    blockBest[k] = initial;
+  }
+  __syncthreads();
+  const int stride = blockDim.x * gridDim.x;
+  int id = lane + (blockIdx.x * blockDim.x);
+  while (id < len) {
+    argMaxWithAtomics(id, blockBest, gradScans, gradSums, vals, colIds,
+                      nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
+    id += stride;
+  }
+  // all block-local updates must be done before they are published
+  __syncthreads();
+  for (int k = lane; k < nUniqKeys; k += blockDim.x) {
+    Split local = blockBest[k];
+    atomicArgMax(nodeSplits+k, local);
+  }
+}
+
+/**
+ * @brief Performs argmax_by_key functionality but for cases when keys need not
+ *  occur contiguously
+ * @param nodeSplits will contain information on best split for each node
+ * @param gradScans exclusive sum on sorted segments for each col
+ * @param gradSums gradient sum for each column in DMatrix, per node-id
+ * @param vals feature values
+ * @param colIds column index for each element in the feature values array
+ * @param nodeAssigns node-id assignments to each element in DMatrix
+ * @param nodes pointer to all nodes for this tree in BFS order
+ * @param nUniqKeys number of unique node-ids in this level
+ * @param nodeStart start index of the node-ids in this level
+ * @param len number of elements
+ * @param param training parameters
+ * @param algo which algorithm to use for argmax_by_key
+ * @throw std::runtime_error if 'algo' is not a known ArgMaxByKeyAlgo value
+ * @note nodeSplits is always reset to default-constructed Splits first. When
+ *  'len' yields zero thread blocks no argmax kernel is launched at all, since
+ *  a launch with an empty grid is an invalid configuration. Launch errors are
+ *  reported through dh::safe_cuda.
+ */
+template <typename node_id_t, int BLKDIM=256, int ITEMS_PER_THREAD=4>
+void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
+                 const gpu_gpair* gradSums, const float* vals, const int* colIds,
+                 const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
+                 node_id_t nodeStart, int len, const TrainParam param,
+                 ArgMaxByKeyAlgo algo) {
+  fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
+  const int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
+  switch(algo) {
+  case ABK_GMEM:
+    if (nBlks > 0) {
+      atomicArgMaxByKeyGmem<node_id_t><<<nBlks,BLKDIM>>>
+          (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+           nUniqKeys, nodeStart, len, param);
+    }
+    break;
+  case ABK_SMEM:
+    if (nBlks > 0) {
+      atomicArgMaxByKeySmem<node_id_t>
+          <<<nBlks,BLKDIM,sizeof(Split)*nUniqKeys>>>
+          (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+           nUniqKeys, nodeStart, len, param);
+    }
+    break;
+  default:
+    throw std::runtime_error("argMaxByKey: Bad algo passed!");
+  };
+  // kernel launches do not return a status; pick up bad launch configs here
+  dh::safe_cuda(cudaGetLastError());
+}
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost
--- a/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh
+++ b/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../common.cuh"
+#include "gradients.cuh"
+
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @struct Pair fused_scan_reduce_by_key.cuh
+ * @brief Pair used for key based scan operations on gpu_gpair
+ */
+struct Pair {
+  /** key identifying the segment this value belongs to */
+  int key;
+  /** gradient pair being scanned */
+  gpu_gpair value;
+};
+
+/** a key that's not used at all in the entire boosting process; it marks
+ *  "no key" (e.g. for threads beyond the end of the input) */
+static const int NONE_KEY = -100;
+
+/**
+ * @brief Number of elements needed in the temporary buffers used by the scan
+ *  operations
+ * @tparam BLKDIM_L1L3 number of elements covered by one thread block
+ * @param size number of elements that will be scanned
+ * @return 'size' divided by BLKDIM_L1L3, rounded up
+ */
+template <int BLKDIM_L1L3=256>
+int scanTempBufferSize(int size) {
+  return dh::div_round_up(size, BLKDIM_L1L3);
+}
+
+/**
+ * @struct AddByKey fused_scan_reduce_by_key.cuh
+ * @brief Binary operator for key-wise scans: values are added up as long as
+ *  the key stays the same, a different key restarts from the second operand
+ */
+struct AddByKey {
+  template <typename T>
+  HOST_DEV_INLINE T operator()(const T &first, const T &second) const {
+    const bool sameKey = (first.key == second.key);
+    T out;
+    out.key = sameKey ? first.key : second.key;
+    if (sameKey) {
+      out.value = first.value + second.value;
+    } else {
+      // key changed: drop whatever was accumulated so far
+      out.value = second.value;
+    }
+    return out;
+  }
+};
+
+/**
+ * @brief First level of the scan pyramid: every thread block performs a
+ *  key-wise exclusive scan over its own BLKDIM_L1L3 elements
+ * @tparam BLKDIM_L1L3 number of threads per block; the launch must use exactly
+ *  this block size (a multiple of the warp size is assumed by the shuffle)
+ * @param scans output, block-local exclusive scan value for each element
+ * @param vals gradients, read through get() together with instIds
+ * @param instIds instance ids for each element
+ * @param mScans output, one partial sum per thread block
+ * @param mKeys output, key of the last element of each thread block
+ * @param keys keys passed to abs2uniqKey together with colIds
+ * @param nUniqKeys number of unique keys, passed to abs2uniqKey
+ * @param colIds column index for each element
+ * @param nodeStart passed to abs2uniqKey
+ * @param size number of elements
+ * @note mScans/mKeys are written only by a block whose last thread maps to a
+ *  valid element, i.e. not by a partially filled last block
+ */
+template <typename node_id_t, int BLKDIM_L1L3>
+__global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
+                               const int* instIds, gpu_gpair* mScans,
+                               int* mKeys, const node_id_t* keys, int nUniqKeys,
+                               const int* colIds, node_id_t nodeStart,
+                               const int size) {
+  Pair rootPair = {NONE_KEY, gpu_gpair(0.f, 0.f)};
+  int myKey;
+  gpu_gpair myValue;
+  typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  Pair threadData;
+  int tid = blockIdx.x*BLKDIM_L1L3 + threadIdx.x;
+  if (tid < size) {
+    myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+    myValue = get(tid, vals, instIds);
+  } else {
+    // threads beyond the input still take part in the scan, with no effect
+    myKey = NONE_KEY;
+    myValue = gpu_gpair(0.f, 0.f);
+  }
+  threadData.key   = myKey;
+  threadData.value = myValue;
+  // get previous key, especially needed for the last thread in this block
+  // in order to pass on the partial scan values.
+  // this statement MUST appear before the checks below!
+  // else, the result of this shuffle operation will be undefined
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
+  // every thread of the block reaches this line, hence the full-warp mask
+  int previousKey = __shfl_up_sync(0xffffffffu, myKey, 1);
+#else
+  int previousKey = __shfl_up(myKey, 1);
+#endif
+  // Collectively compute the block-wide exclusive prefix sum
+  BlockScan(temp_storage).ExclusiveScan(threadData, threadData, rootPair,
+                                        AddByKey());
+  if (tid < size) {
+    scans[tid] = threadData.value;
+  } else {
+    return;
+  }
+  if (threadIdx.x == BLKDIM_L1L3 - 1) {
+    // the exclusive result belongs to the previous element's key; it only
+    // contributes to this block's tail when that key is also mine
+    threadData.value = (myKey == previousKey)?
+                        threadData.value :
+                        gpu_gpair(0.0f, 0.0f);
+    mKeys[blockIdx.x]  = myKey;
+    mScans[blockIdx.x] = threadData.value + myValue;
+  }
+}
+
+/**
+ * @brief Second level of the scan pyramid: key-wise inclusive scan, in place,
+ *  over the per-block partial results
+ * @tparam BLKSIZE number of threads in the thread block (must be > 1)
+ * @param mScans per-block partial sums, scanned in place
+ * @param mKeys key associated with each per-block partial sum
+ * @param mLength number of entries in mScans/mKeys
+ * @note only threadIdx.x is used for indexing, so this is meant to be
+ *  launched with a single block of BLKSIZE threads
+ */
+template <int BLKSIZE>
+__global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) {
+  typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+  Pair threadData;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  // Consecutive chunks overlap by one element (the stride is BLKSIZE-1): the
+  // last result of a chunk is read again as the first input of the next
+  // chunk, which carries the running sum across chunks.
+  // The trip count depends on 'base' only, so that ALL threads of the block
+  // take part in every BlockScan call, as block-wide collectives require.
+  for (int base = 0; base < mLength; base += BLKSIZE-1) {
+    const int i = base + threadIdx.x;
+    const bool inRange = (i < mLength);
+    if (inRange) {
+      threadData.key   =  mKeys[i];
+      threadData.value = mScans[i];
+    } else {
+      // padding sits behind all valid elements of an inclusive scan and
+      // thus cannot influence any of their results
+      threadData.key   = NONE_KEY;
+      threadData.value = gpu_gpair(0.0f, 0.0f);
+    }
+    BlockScan(temp_storage).InclusiveScan(threadData, threadData,
+                                          AddByKey());
+    if (inRange) {
+      mScans[i] = threadData.value;
+    }
+    // makes the results visible to the next chunk and protects the reuse
+    // of temp_storage
+    __syncthreads();
+  }
+}
+
+/**
+ * @brief Third level of the scan pyramid: adds the partial result of the
+ *  preceding thread block to the block-local scans and writes out the
+ *  per-key sums
+ * @tparam BLKDIM_L1L3 number of threads per block; the launch must use exactly
+ *  this block size
+ * @param sums output, gradient sum for each key
+ * @param scans block-local scans on input, final key-wise scans on output
+ * @param vals gradients, read through get() together with instIds
+ * @param instIds instance ids for each element
+ * @param mScans scanned per-block partial sums
+ * @param mKeys key of the last element of each thread block
+ * @param keys keys passed to abs2uniqKey together with colIds
+ * @param nUniqKeys number of unique keys, passed to abs2uniqKey
+ * @param colIds column index for each element
+ * @param nodeStart passed to abs2uniqKey
+ * @param size number of elements
+ * @note a sum is only ever written for a non-negative key
+ */
+template <typename node_id_t, int BLKDIM_L1L3>
+__global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
+                               const gpu_gpair* vals, const int* instIds,
+                               const gpu_gpair* mScans, const int* mKeys,
+                               const node_id_t* keys, int nUniqKeys,
+                               const int* colIds, node_id_t nodeStart,
+                               const int size) {
+  int relId = threadIdx.x;
+  int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
+  // to avoid the following warning from nvcc:
+  //   __shared__ memory variable with non-empty constructor or destructor
+  //     (potential race between threads)
+  __shared__ char gradBuff[sizeof(gpu_gpair)];
+  __shared__ int s_mKeys;
+  gpu_gpair* s_mScans = (gpu_gpair*)gradBuff;
+  if(tid >= size)
+    return;
+  // cache block-wide partial scan info
+  if (relId == 0) {
+    s_mKeys = (blockIdx.x > 0)? mKeys[blockIdx.x-1] : NONE_KEY;
+    s_mScans[0] = (blockIdx.x > 0)? mScans[blockIdx.x-1] : gpu_gpair();
+  }
+  int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+  int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid-1, keys, colIds,
+                                                      nodeStart, nUniqKeys);
+  gpu_gpair myValue = scans[tid];
+  __syncthreads();
+  // the carry of the preceding block applies as long as its last key is
+  // still the key of my predecessor
+  if (blockIdx.x > 0 && s_mKeys == previousKey) {
+    myValue += s_mScans[0];
+  }
+  // The very last element has no successor that could write out the total of
+  // its segment, so it does that itself. The total is stored under its OWN
+  // key: indexing with previousKey would go out of bounds whenever that key
+  // is negative (e.g. NONE_KEY for a single-element input) and would leave
+  // the sum of a one-element final segment unwritten.
+  if ((tid == size - 1) && (myKey >= 0)) {
+    gpu_gpair lastSum = get(tid, vals, instIds);
+    if (previousKey == myKey) {
+      lastSum += myValue;
+    }
+    sums[myKey] = lastSum;
+  }
+  // first element of a new segment: what was accumulated so far is the total
+  // of the previous segment, and my own scan restarts from zero
+  if ((previousKey != myKey) && (previousKey >= 0)) {
+    sums[previousKey] = myValue;
+    myValue = gpu_gpair(0.0f, 0.0f);
+  }
+  scans[tid] = myValue;
+}
+
+/**
+ * @brief Performs fused reduce and scan by key functionality. It is assumed that
+ *  the keys occur contiguously!
+ * @param sums the output gradient reductions for each element performed key-wise
+ * @param scans the output gradient scans for each element performed key-wise
+ * @param vals the gradients evaluated for each observation.
+ * @param instIds instance ids for each element
+ * @param keys keys to be used to segment the reductions. Currently, we need
+ *  one key per element.
+ * @param size number of elements to be scanned
+ * @param nUniqKeys max number of uniq keys found per column
+ * @param nCols number of columns
+ * @param tmpScans temporary scan buffer needed for cub-pyramid algo
+ * @param tmpKeys temporary key buffer needed for cub-pyramid algo
+ * @param colIds column indices for each element in the array
+ * @param nodeStart index of the leftmost node in the current level
+ * @note 'sums' is always zeroed out. For an empty input (size <= 0) no kernel
+ *  is launched, since a launch with an empty grid is an invalid
+ *  configuration. CUDA errors are reported through dh::safe_cuda.
+ */
+template <typename node_id_t, int BLKDIM_L1L3=256, int BLKDIM_L2=512>
+void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals,
+                     const int* instIds, const node_id_t* keys, int size,
+                     int nUniqKeys, int nCols, gpu_gpair* tmpScans,
+                     int* tmpKeys, const int* colIds, node_id_t nodeStart) {
+  dh::safe_cuda(cudaMemset(sums, 0, nUniqKeys*nCols*sizeof(gpu_gpair)));
+  if (size <= 0) {
+    return;
+  }
+  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
+  // L1: block-local scans plus one partial result per block
+  cubScanByKeyL1<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
+      (scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
+       nodeStart, size);
+  // L2: scan of the per-block partial results
+  cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
+  // L3: propagate the partial results and write out the sums
+  cubScanByKeyL3<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
+      (sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
+       nodeStart, size);
+  // kernel launches do not return a status; pick up bad launch configs here
+  dh::safe_cuda(cudaGetLastError());
+}
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost
--- a/plugin/updater_gpu/src/exact/gpu_builder.cuh
+++ b/plugin/updater_gpu/src/exact/gpu_builder.cuh
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../../../../src/tree/param.h"
+#include "xgboost/tree_updater.h"
+#include "cub/cub.cuh"
+#include "../common.cuh"
+#include <vector>
+#include "loss_functions.cuh"
+#include "gradients.cuh"
+#include "node.cuh"
+#include "argmax_by_key.cuh"
+#include "split2node.cuh"
+#include "fused_scan_reduce_by_key.cuh"
+
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @brief Fills in the root node (nodes[0]) from the total gradient sum
+ * @param nodes all the nodes of the tree
+ * @param sums gradient sums; sums[0] is used for the root node
+ * @param param training parameters
+ */
+template <typename node_id_t>
+__global__ void initRootNode(Node<node_id_t>* nodes, const gpu_gpair* sums,
+                             const TrainParam param) {
+  // gradients already evaluated inside transferGrads
+  Node<node_id_t> root;
+  root.gradSum = sums[0];
+  root.id = 0;
+  root.weight = CalcWeight(param, root.gradSum.g, root.gradSum.h);
+  root.score = CalcGain(param, root.gradSum.g, root.gradSum.h);
+  nodes[0] = root;
+}
+
+/**
+ * @brief Writes the column index of every element; one thread block handles
+ *  one column (blockIdx.x is the column index)
+ * @param colIds output, column index for each element
+ * @param colOffsets start offset of each column, with one extra entry marking
+ *  the end of the last column
+ */
+template <typename node_id_t>
+__global__ void assignColIds(int* colIds, const int* colOffsets) {
+  const int col = blockIdx.x;
+  const int colEnd = colOffsets[col+1];
+  int pos = colOffsets[col] + threadIdx.x;
+  while (pos < colEnd) {
+    colIds[pos] = col;
+    pos += blockDim.x;
+  }
+}
+
+/**
+ * @brief Moves every instance to the child its current node sends it to by
+ *  default, or to UNUSED_NODE if that node is a leaf or unused
+ * @param nodeIdsPerInst node-id for each instance, updated in place
+ * @param nodes all the nodes of the tree
+ * @param nRows number of instances
+ */
+template <typename node_id_t>
+__global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
+                                   const Node<node_id_t>* nodes, int nRows) {
+  const int inst = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (inst >= nRows) {
+    return;
+  }
+  // if this element belongs to none of the currently active node-id's
+  const node_id_t current = nodeIdsPerInst[inst];
+  if (current == UNUSED_NODE) {
+    return;
+  }
+  const Node<node_id_t> n = nodes[current];
+  node_id_t next = UNUSED_NODE;
+  if (!(n.isLeaf() || n.isUnused())) {
+    // left child is 2*id+1, right child is 2*id+2
+    next = n.isDefaultLeft() ? ((2 * n.id) + 1) : ((2 * n.id) + 2);
+  }
+  nodeIdsPerInst[inst] = next;
+}
+
+/**
+ * @brief Assigns instances with a non-missing value in the split column of
+ *  their node to the matching child node, and initializes nodeLocations
+ * @param nodeIdsPerInst output, node-id for each instance
+ * @param nodeLocations output, filled with 0, 1, ..., nVals-1
+ * @param nodeIds node-id of each element before this update
+ * @param instId instance id for each element
+ * @param nodes all the nodes of the tree
+ * @param colOffsets start offset of each column
+ * @param vals feature values
+ * @param nVals number of elements
+ * @param nCols number of columns (not used here)
+ */
+template <typename node_id_t>
+__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
+                              const node_id_t* nodeIds, const int* instId,
+                              const Node<node_id_t>* nodes, const int* colOffsets,
+                              const float* vals, int nVals, int nCols) {
+  int id = threadIdx.x + (blockIdx.x * blockDim.x);
+  const int stride = blockDim.x * gridDim.x;
+  for (; id < nVals; id += stride) {
+    // fusing generation of indices for node locations
+    nodeLocations[id] = id;
+    // using nodeIds here since the previous kernel would have updated
+    // the nodeIdsPerInst with all default assignments
+    int nId = nodeIds[id];
+    // if this element belongs to none of the currently active node-id's
+    if (nId == UNUSED_NODE) {
+      continue;
+    }
+    const Node<node_id_t> n = nodes[nId];
+    // Nodes that were not split must be filtered out BEFORE their column
+    // index is used to read colOffsets: presumably such nodes carry no valid
+    // column index, which would make that read go out of bounds.
+    if (n.isLeaf() || n.isUnused()) {
+      continue;
+    }
+    int colId = n.colIdx;
+    int start = colOffsets[colId];
+    int end = colOffsets[colId + 1];
+    ///@todo: too much wasteful threads!!
+    // only the elements of the split column decide on the child
+    if ((id >= start) && (id < end)) {
+      node_id_t result = (2 * n.id) + 1 + (vals[id] >= n.threshold);
+      nodeIdsPerInst[instId[id]] = result;
+    }
+  }
+}
+
+/**
+ * @brief Sets the score of every used node to -FLT_MAX if it has no room for
+ *  children in the array or if both of its children are unused
+ * @param nodes all the nodes of the tree; children of node i are at 2i+1, 2i+2
+ * @param len number of nodes
+ */
+template <typename node_id_t>
+__global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
+  const int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (id >= len) {
+    return;
+  }
+  if (nodes[id].isUnused()) {
+    return;
+  }
+  const int lid = (id << 1) + 1;
+  const int rid = (id << 1) + 2;
+  // bottom-most nodes have no children inside the array
+  const bool bottomMost = (lid >= len) || (rid >= len);
+  if (bottomMost || (nodes[lid].isUnused() && nodes[rid].isUnused())) {
+    nodes[id].score = -FLT_MAX;
+  }
+}
+
+// Forward declarations of the unit-test functions, needed so that GPUBuilder
+// can declare them as friends and thereby give them access to its private
+// members.
+template <typename node_id_t> void testSmallData();
+template <typename node_id_t> void testLargeData();
+template <typename node_id_t> void testAllocate();
+template <typename node_id_t> void testMarkLeaves();
+template <typename node_id_t> void testDense2Sparse();
+template <typename node_id_t> class GPUBuilder;
+// test helper that sets up 'builder' from the given file
+template <typename node_id_t>
+std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
+    const std::string& file,
+    xgboost::tree::exact::GPUBuilder<node_id_t>& builder);
+
+/**
+ * @class GPUBuilder gpu_builder.cuh
+ * @brief Builds one tree per Update() call, level by level, on the GPU
+ * @tparam node_id_t integer type used to store node-ids
+ * @note Init() must be called before the first Update(). The device buffers
+ *  are set up only once, during the first Update().
+ */
+template <typename node_id_t>
+class GPUBuilder {
+ public:
+  GPUBuilder(): allocated(false) {}
+
+  ~GPUBuilder() {}
+
+  /**
+   * @brief Stores the training parameters and derives the node/leaf limits of
+   *  a full binary tree of depth max_depth from them
+   */
+  void Init(const TrainParam& p) {
+    param = p;
+    maxNodes = (1 << (param.max_depth + 1)) - 1;
+    maxLeaves = 1 << param.max_depth;
+  }
+
+  /** @brief Replaces the training parameters (node/leaf limits stay as is) */
+  void UpdateParam(const TrainParam &param) { this->param = param; }
+
+  /**
+   * @brief Builds one tree
+   * @param gpair gradient pairs, one per instance
+   * @param hMat training data
+   * @param hTree the tree to be filled in
+   */
+  /// @note: Update should be only after Init!!
+  void Update(const std::vector<bst_gpair>& gpair, DMatrix *hMat,
+              RegTree* hTree) {
+    if (!allocated) {
+      setupOneTimeData(*hMat);
+    }
+    for (int i = 0; i < param.max_depth; ++i) {
+      if (i == 0) {
+        // make sure to start on a fresh tree with sorted values!
+        vals.current_dvec() = vals_cached;
+        instIds.current_dvec() = instIds_cached;
+        transferGrads(gpair);
+      }
+      // level i holds 2^i nodes, starting at node-id 2^i - 1
+      int nNodes = 1 << i;
+      node_id_t nodeStart = nNodes - 1;
+      initNodeData(i, nodeStart, nNodes);
+      findSplit(i, nodeStart, nNodes);
+    }
+    // mark all the used nodes with unused children as leaf nodes
+    markLeaves();
+    dense2sparse(*hTree);
+  }
+
+private:
+  friend void testSmallData<node_id_t>();
+  friend void testLargeData<node_id_t>();
+  friend void testAllocate<node_id_t>();
+  friend void testMarkLeaves<node_id_t>();
+  friend void testDense2Sparse<node_id_t>();
+  friend std::shared_ptr<xgboost::DMatrix> setupGPUBuilder<node_id_t>(
+      const std::string& file, GPUBuilder<node_id_t>& builder);
+
+  TrainParam param;
+  /** whether we have initialized memory already (so as not to repeat!) */
+  bool allocated;
+  /** feature values stored in column-major compressed format */
+  dh::dvec2<float> vals;
+  dh::dvec<float> vals_cached;
+  /** corresponding instance id's of these feature values */
+  dh::dvec2<int> instIds;
+  dh::dvec<int> instIds_cached;
+  /** column offsets for these feature values */
+  dh::dvec<int> colOffsets;
+  /** gradient pair of each instance */
+  dh::dvec<gpu_gpair> gradsInst;
+  /** node-id of each feature value */
+  dh::dvec2<node_id_t> nodeAssigns;
+  dh::dvec2<int> nodeLocations;
+  /** all the nodes of the tree, children of node i being at 2i+1 and 2i+2 */
+  dh::dvec<Node<node_id_t> > nodes;
+  /** node-id of each instance */
+  dh::dvec<node_id_t> nodeAssignsPerInst;
+  dh::dvec<gpu_gpair> gradSums;
+  dh::dvec<gpu_gpair> gradScans;
+  /** best split found for each node of the current level */
+  dh::dvec<Split> nodeSplits;
+  /** number of feature values */
+  int nVals;
+  /** number of instances */
+  int nRows;
+  /** number of columns */
+  int nCols;
+  int maxNodes;
+  int maxLeaves;
+  dh::CubMemory tmp_mem;
+  dh::dvec<gpu_gpair> tmpScanGradBuff;
+  dh::dvec<int> tmpScanKeyBuff;
+  /** column index of each feature value */
+  dh::dvec<int> colIds;
+  dh::bulk_allocator ba;
+
+  /**
+   * @brief Finds the best split for every node of a level and stores it in
+   *  the nodes
+   * @param level the level being processed
+   * @param nodeStart first node-id of this level
+   * @param nNodes number of nodes in this level
+   */
+  void findSplit(int level, node_id_t nodeStart, int nNodes) {
+    reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
+                    instIds.current(), nodeAssigns.current(), nVals, nNodes,
+                    nCols, tmpScanGradBuff.data(), tmpScanKeyBuff.data(),
+                    colIds.data(), nodeStart);
+    // shared-memory atomics only for the first few levels
+    argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
+                vals.current(), colIds.data(), nodeAssigns.current(),
+                nodes.data(), nNodes, nodeStart, nVals, param,
+                level<=MAX_ABK_LEVELS? ABK_SMEM : ABK_GMEM);
+    split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
+               gradSums.data(), vals.current(), colIds.data(), colOffsets.data(),
+               nodeAssigns.current(), nNodes, nodeStart, nCols, param);
+  }
+
+  /**
+   * @brief Allocates all the device buffers in one go
+   * @param offsetSize number of entries in the column offsets array
+   * @note nVals, nRows, nCols, maxNodes and maxLeaves must be set already
+   */
+  void allocateAllData(int offsetSize) {
+    int tmpBuffSize = scanTempBufferSize(nVals);
+    ba.allocate(&vals, nVals,
+                &vals_cached, nVals,
+                &instIds, nVals,
+                &instIds_cached, nVals,
+                &colOffsets, offsetSize,
+                &gradsInst, nRows,
+                &nodeAssigns, nVals,
+                &nodeLocations, nVals,
+                &nodes, maxNodes,
+                &nodeAssignsPerInst, nRows,
+                &gradSums, maxLeaves*nCols,
+                &gradScans, nVals,
+                &nodeSplits, maxLeaves,
+                &tmpScanGradBuff, tmpBuffSize,
+                &tmpScanKeyBuff, tmpBuffSize,
+                &colIds, nVals);
+  }
+
+  /**
+   * @brief Converts the data to CSC, allocates the device buffers and
+   *  transfers + sorts the data
+   * @throw std::runtime_error if the matrix is not a single column block
+   */
+  void setupOneTimeData(DMatrix& hMat) {
+    size_t free_memory = dh::available_memory();
+    if (!hMat.SingleColBlock()) {
+      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
+    }
+    std::vector<float> fval;
+    std::vector<int> fId, offset;
+    convertToCsc(hMat, fval, fId, offset);
+    allocateAllData((int)offset.size());
+    transferAndSortData(fval, fId, offset);
+    allocated = true;
+    if (!param.silent) {
+      const int mb_size = 1048576;
+      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
+                   << free_memory / mb_size << " MB on " << dh::device_name();
+    }
+  }
+
+  /**
+   * @brief Copies the matrix into host-side CSC arrays and sets nRows, nCols
+   *  and nVals
+   * @param hMat the input matrix
+   * @param fval output, feature values column after column
+   * @param fId output, instance id of each feature value
+   * @param offset output, start of each column in fval/fId, with one extra
+   *  entry holding the total number of values
+   */
+  void convertToCsc(DMatrix& hMat, std::vector<float>& fval,
+                    std::vector<int>& fId, std::vector<int>& offset) {
+    // bind by reference: there is no need to copy the whole meta info
+    const MetaInfo& info = hMat.info();
+    nRows = info.num_row;
+    nCols = info.num_col;
+    offset.reserve(nCols + 1);
+    offset.push_back(0);
+    // NOTE: fval/fId are deliberately not reserved for nCols*nRows elements;
+    // that product can overflow an int and, for sparse data, is far more than
+    // the number of values actually present.
+    // in case you end up with a DMatrix having no column access
+    // then make sure to enable that before copying the data!
+    if (!hMat.HaveColAccess()) {
+      const std::vector<bool> enable(nCols, true);
+      hMat.InitColAccess(enable, 1, nRows);
+    }
+    dmlc::DataIter<ColBatch>* iter = hMat.ColIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const ColBatch& batch = iter->Value();
+      for (size_t i = 0; i < batch.size; i++) {
+        const ColBatch::Inst& col = batch[i];
+        for (const ColBatch::Entry* it=col.data;it!=col.data+col.length;it++) {
+          int inst_id = static_cast<int>(it->index);
+          fval.push_back(it->fvalue);
+          fId.push_back(inst_id);
+        }
+        offset.push_back(static_cast<int>(fval.size()));
+      }
+    }
+    nVals = static_cast<int>(fval.size());
+  }
+
+  /**
+   * @brief Copies the CSC arrays to the device, sorts the values of every
+   *  column, caches the sorted result and fills in colIds
+   */
+  void transferAndSortData(const std::vector<float>& fval,
+                           const std::vector<int>& fId,
+                           const std::vector<int>& offset) {
+    vals.current_dvec() = fval;
+    instIds.current_dvec() = fId;
+    colOffsets = offset;
+    segmentedSort<float,int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
+    // every new tree starts from these sorted copies (see Update)
+    vals_cached = vals.current_dvec();
+    instIds_cached = instIds.current_dvec();
+    assignColIds<node_id_t><<<nCols,512>>>(colIds.data(), colOffsets.data());
+    dh::safe_cuda(cudaGetLastError());
+  }
+
+  /**
+   * @brief Copies the gradient pairs to the device and computes their total
+   *  for the root node
+   */
+  void transferGrads(const std::vector<bst_gpair>& gpair) {
+    // the host gradient pairs are copied bit-wise into the device ones, which
+    // is only valid as long as both types have the same size
+    static_assert(sizeof(bst_gpair) == sizeof(gpu_gpair),
+                  "bst_gpair and gpu_gpair must have the same size");
+    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
+                             sizeof(gpu_gpair)*nRows, cudaMemcpyHostToDevice));
+    // evaluate the full-grad reduction for the root node
+    sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
+  }
+
+  /**
+   * @brief Prepares nodes and node assignments for a level
+   * @param level the level being processed
+   * @param nodeStart first node-id of this level
+   * @param nNodes number of nodes in this level
+   */
+  void initNodeData(int level, node_id_t nodeStart, int nNodes) {
+    // all instances belong to root node at the beginning!
+    if (level == 0) {
+      nodes.fill(Node<node_id_t>());
+      nodeAssigns.current_dvec().fill(0);
+      nodeAssignsPerInst.fill(0);
+      // for root node, just update the gradient/score/weight/id info
+      // before splitting it! Currently all data is on GPU, hence this
+      // stupid little kernel
+      initRootNode<<<1,1>>>(nodes.data(), gradSums.data(), param);
+      dh::safe_cuda(cudaGetLastError());
+    } else {
+      const int BlkDim = 256;
+      const int ItemsPerThread = 4;
+      // assign default node ids first
+      int nBlks = dh::div_round_up(nRows, BlkDim);
+      fillDefaultNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
+                                           nodes.data(), nRows);
+      dh::safe_cuda(cudaGetLastError());
+      // evaluate the correct child indices of non-missing values next
+      nBlks = dh::div_round_up(nVals, BlkDim*ItemsPerThread);
+      assignNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
+                                      nodeLocations.current(),
+                                      nodeAssigns.current(),
+                                      instIds.current(), nodes.data(),
+                                      colOffsets.data(), vals.current(),
+                                      nVals, nCols);
+      dh::safe_cuda(cudaGetLastError());
+      // gather the node assignments across all other columns too
+      gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
+                        instIds.current(), nVals);
+      sortKeys(level);
+    }
+  }
+
+  /**
+   * @brief Sorts the elements of every column by node-id and reorders the
+   *  values and instance ids accordingly
+   */
+  void sortKeys(int level) {
+    // segmented-sort the arrays based on node-id's
+    // but we don't need more than level+1 bits for sorting!
+    segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
+                  0, level+1);
+    gather<float,int>(vals.other(), vals.current(), instIds.other(),
+                      instIds.current(), nodeLocations.current(), nVals);
+    // the gathered data was written to the 'other' buffers: swap them in
+    vals.buff().selector ^= 1;
+    instIds.buff().selector ^= 1;
+  }
+
+  /** @brief Launches markLeavesKernel over all the nodes */
+  void markLeaves() {
+    const int BlkDim = 128;
+    int nBlks = dh::div_round_up(maxNodes, BlkDim);
+    markLeavesKernel<<<nBlks,BlkDim>>>(nodes.data(), maxNodes);
+    dh::safe_cuda(cudaGetLastError());
+  }
+
+  /**
+   * @brief Copies the device-side nodes into the RegTree
+   * @param tree the tree to be filled in
+   * @note unused nodes are skipped, so the node-ids inside 'tree' are
+   *  consecutive
+   */
+  void dense2sparse(RegTree &tree) {
+    std::vector<Node<node_id_t> > hNodes = nodes.as_vector();
+    int nodeId = 0;
+    for (int i = 0; i < maxNodes; ++i) {
+      const Node<node_id_t>& n = hNodes[i];
+      if ((i != 0) && hNodes[i].isLeaf()) {
+        tree[nodeId].set_leaf(n.weight * param.learning_rate);
+        tree.stat(nodeId).sum_hess = n.gradSum.h;
+        ++nodeId;
+      } else if (!hNodes[i].isUnused()) {
+        tree.AddChilds(nodeId);
+        tree[nodeId].set_split(n.colIdx, n.threshold, n.dir==LeftDir);
+        tree.stat(nodeId).loss_chg = n.score;
+        tree.stat(nodeId).sum_hess = n.gradSum.h;
+        tree.stat(nodeId).base_weight = n.weight;
+        // placeholders, overwritten once the children themselves are visited
+        tree[tree[nodeId].cleft()].set_leaf(0);
+        tree[tree[nodeId].cright()].set_leaf(0);
+        ++nodeId;
+      }
+    }
+  }
+};
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost
--- a/plugin/updater_gpu/src/exact/gradients.cuh
+++ b/plugin/updater_gpu/src/exact/gradients.cuh
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../common.cuh"
+
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @struct gpu_gpair gradients.cuh
+ * @brief The first/second order gradients for iteratively building the tree
+ */
+struct gpu_gpair {
+  /** first-order gradient, the 'g_i' of the xgboost paper */
+  float g;
+  /** second-order gradient, the 'h_i' of the xgboost paper */
+  float h;
+
+  /** Creates a pair with both components set to zero */
+  HOST_DEV_INLINE gpu_gpair(): g(0.f), h(0.f) {}
+  /** Creates a pair from explicit component values */
+  HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h): g(_g), h(_h) {}
+  /** Component-wise copy */
+  HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a): g(a.g), h(a.h) {}
+  /** Implicit conversion from int: both components receive 'value' */
+  HOST_DEV_INLINE gpu_gpair(int value)
+    : g(static_cast<float>(value)), h(static_cast<float>(value)) {}
+
+  /**
+   * @brief Tells whether the hessian exceeds the given weight
+   * @param minWeight the weight to compare the hessian against
+   * @return true only if the hessian is strictly greater than minWeight
+   * @note useful when deciding whether a child node may be split further
+   */
+  HOST_DEV_INLINE bool isSplittable(float minWeight) const {
+    return (minWeight < h);
+  }
+
+  /** Component-wise in-place addition */
+  HOST_DEV_INLINE gpu_gpair& operator+=(const gpu_gpair& a) {
+    g += a.g;
+    h += a.h;
+    return *this;
+  }
+
+  /** Component-wise in-place subtraction */
+  HOST_DEV_INLINE gpu_gpair& operator-=(const gpu_gpair& a) {
+    g -= a.g;
+    h -= a.h;
+    return *this;
+  }
+
+  /** Component-wise sum of two pairs */
+  HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
+                                             const gpu_gpair& b) {
+    gpu_gpair sum(a);
+    sum += b;
+    return sum;
+  }
+
+  /** Component-wise difference of two pairs */
+  HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
+                                             const gpu_gpair& b) {
+    gpu_gpair diff(a);
+    diff -= b;
+    return diff;
+  }
+};
+
+
+/**
+ * @brief Gradient value getter function
+ * @param id the index into the instIds array
+ * @param vals the gradient value buffer, indexed by the value read from instIds
+ * @param instIds instance index buffer
+ * @return the gradient value stored at vals[instIds[id]]
+ */
+HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, const int* instIds) {
+  // two-step lookup: position -> instance id -> gradient of that instance
+  return vals[instIds[id]];
+}
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost
--- a/plugin/updater_gpu/src/exact/loss_functions.cuh
+++ b/plugin/updater_gpu/src/exact/loss_functions.cuh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../common.cuh"
+#include "gradients.cuh"
+
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @brief Gain obtained by splitting a node at a given scan position
+ * @param param the training parameter struct
+ * @param scan gradient sum that goes to the left child
+ * @param missing gradient sum that follows the default direction
+ * @param parent_sum gradient sum of the node being split
+ * @param parent_gain gain of the node being split
+ * @param missing_left whether 'missing' is added to the left child
+ * @return gain of left child + gain of right child - parent_gain
+ */
+HOST_DEV_INLINE float device_calc_loss_chg(const TrainParam &param,
+                                           const gpu_gpair &scan,
+                                           const gpu_gpair &missing,
+                                           const gpu_gpair &parent_sum,
+                                           const float &parent_gain,
+                                           bool missing_left) {
+  // the left child always gets 'scan', plus 'missing' when it defaults left
+  const gpu_gpair left = missing_left ? (scan + missing) : scan;
+  // whatever is not on the left ends up on the right
+  const gpu_gpair right = parent_sum - left;
+  const float left_gain = CalcGain(param, left.g, left.h);
+  const float right_gain = CalcGain(param, right.g, right.h);
+  return left_gain + right_gain - parent_gain;
+}
+
+/**
+ * @brief Evaluates a split for both default directions and keeps the better
+ * @param scan gradient sum that goes to the left child
+ * @param missing gradient sum that follows the default direction
+ * @param parent_sum gradient sum of the node being split
+ * @param parent_gain gain of the node being split
+ * @param param the training parameter struct
+ * @param missing_left_out (output) true if sending 'missing' to the left is
+ *  at least as good as sending it to the right
+ * @return the loss change of the chosen direction
+ */
+HOST_DEV_INLINE float loss_chg_missing(const gpu_gpair &scan,
+                                       const gpu_gpair &missing,
+                                       const gpu_gpair &parent_sum,
+                                       const float &parent_gain,
+                                       const TrainParam &param,
+                                       bool &missing_left_out) {
+  const float goLeft =
+      device_calc_loss_chg(param, scan, missing, parent_sum, parent_gain, true);
+  const float goRight = device_calc_loss_chg(
+      param, scan, missing, parent_sum, parent_gain, false);
+  // ties are resolved in favour of the left direction
+  missing_left_out = (goLeft >= goRight);
+  return missing_left_out ? goLeft : goRight;
+}
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost
--- a/plugin/updater_gpu/src/exact/node.cuh
+++ b/plugin/updater_gpu/src/exact/node.cuh
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "gradients.cuh"
+#include "../common.cuh"
+
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @enum DefaultDirection node.cuh
+ * @brief Default direction to be followed in case of missing values
+ */
+enum DefaultDirection {
+  /** missing values go to the left child */
+  LeftDir = 0,
+  /** missing values go to the right child */
+  RightDir = 1
+};
+
+
+/**
+ * Sentinel meaning "not in use": the Node constructor stores it as the default
+ * value of both Node::id and Node::colIdx, and Node::isUnused() tests the id
+ * against it
+ */
+static const int UNUSED_NODE = -1;
+
+
+/**
+ * @struct Split node.cuh
+ * @brief Abstraction of a possible split in the decision tree
+ */
+struct Split {
+  /** best gain score found for this node */
+  float score;
+  /** position in the DMatrix at which to split */
+  int index;
+
+  /** Starts as "no split found": lowest possible score and index INT_MAX */
+  HOST_DEV_INLINE Split(): score(-FLT_MAX), index(INT_MAX) {}
+
+  /**
+   * @brief Tells whether this split may be used to create child nodes
+   * @param minSplitLoss minimum score required for the split to be made
+   * @return true if an index has been set and the score reaches minSplitLoss
+   */
+  HOST_DEV_INLINE bool isSplittable(float minSplitLoss) const {
+    // INT_MAX is the value set by the constructor, i.e. no split was recorded
+    if (index == INT_MAX) return false;
+    return (score >= minSplitLoss);
+  }
+};
+
+
+/**
+ * @class Node node.cuh
+ * @brief Abstraction of a node in the decision tree
+ */
+template <typename node_id_t>
+class Node {
+ public:
+  /** sum of gradients across all training samples part of this node */
+  gpu_gpair gradSum;
+  /** the optimal score for this node (-FLT_MAX marks a leaf, see isLeaf) */
+  float score;
+  /** weightage for this node */
+  float weight;
+  /** default direction for missing values */
+  DefaultDirection dir;
+  /** threshold value for comparison */
+  float threshold;
+  /** column (feature) index whose value needs to be compared in this node */
+  int colIdx;
+  /** node id (used as key for reduce/scan) */
+  node_id_t id;
+
+  /** Creates an unused node: id and colIdx hold the UNUSED_NODE sentinel */
+  HOST_DEV_INLINE Node(): gradSum(), score(-FLT_MAX), weight(-FLT_MAX),
+                          dir(LeftDir), threshold(0.f), colIdx(UNUSED_NODE),
+                          id(static_cast<node_id_t>(UNUSED_NODE)) {}
+
+  /** Tells whether this node is part of the decision tree */
+  HOST_DEV_INLINE bool isUnused() const {
+    // Compare in node_id_t: a plain 'id == UNUSED_NODE' promotes a narrow
+    // unsigned id (e.g. 65535 for a 16-bit type) to int and would then never
+    // equal -1. For signed id types the result is the same as before.
+    return (id == static_cast<node_id_t>(UNUSED_NODE));
+  }
+
+  /** Tells whether this node is a leaf of the decision tree */
+  HOST_DEV_INLINE bool isLeaf() const {
+    // a used node whose score still holds the -FLT_MAX marker
+    return (!isUnused() && (score == -FLT_MAX));
+  }
+
+  /** Tells whether default direction is left child or not */
+  HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
+};
+
+
+/**
+ * @struct Segment node.cuh
+ * @brief Space inefficient, but super easy to implement structure to define
+ *   the start and end of a segment in the input array
+ */
+struct Segment {
+  /** index at which the segment starts */
+  int start;
+  /** index at which the segment ends */
+  int end;
+
+  /** Starts out invalid: both bounds are set to -1 */
+  HOST_DEV_INLINE Segment(): start(-1), end(-1) {}
+
+  /** Tells whether both bounds have been moved away from the initial -1 */
+  HOST_DEV_INLINE bool isValid() const {
+    return ((start != -1) && (end != -1));
+  }
+};
+
+
+/**
+ * @enum NodeType node.cuh
+ * @brief Useful to describe the node type in a dense BFS-order tree array
+ */
+enum NodeType {
+  /** node that is not a leaf */
+  NODE = 0,
+  /** node that is a leaf */
+  LEAF = 1,
+  /** node that is not in use */
+  UNUSED = 2
+};
+
+
+/**
+ * @brief Absolute BFS order IDs to col-wise unique IDs based on user input
+ * @param tid the index of the element that this thread should access
+ * @param abs the array of absolute IDs
+ * @param colIds the array of column IDs for each element
+ * @param nodeStart the start of the node ID at this level
+ * @param nKeys number of nodes at this level.
+ * @return the uniq key
+ */
+template <typename node_id_t>
+HOST_DEV_INLINE int abs2uniqKey(int tid, const node_id_t* abs,
+                                const int* colIds, node_id_t nodeStart,
+                                int nKeys) {
+  const node_id_t nodeId = abs[tid];
+  // Test for the sentinel in node_id_t: after widening to int a narrow
+  // unsigned id would hold e.g. 65535 and no longer compare equal to -1.
+  // For signed id types the result is the same as comparing the int value.
+  if (nodeId == static_cast<node_id_t>(UNUSED_NODE)) return UNUSED_NODE;
+  const int a = nodeId;
+  // offset of the node within its level, plus one block of nKeys per column
+  return ((a - nodeStart) + (colIds[tid] * nKeys));
+}
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost
--- a/plugin/updater_gpu/src/exact/split2node.cuh
+++ b/plugin/updater_gpu/src/exact/split2node.cuh
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../../../../src/tree/param.h"
+#include "gradients.cuh"
+#include "node.cuh"
+#include "loss_functions.cuh"
+
+
+namespace xgboost {
+namespace tree {
+namespace exact {
+
+/**
+ * @brief Helper function to update the child node based on the current status
+ *  of its parent node
+ * @param nodes the nodes array in which the position at 'nid' will be updated
+ * @param nid the nodeId in the 'nodes' array corresponding to this child node
+ * @param grad gradient sum for this child node
+ * @param param the training parameter struct
+ */
+template <typename node_id_t>
+DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
+                                   const gpu_gpair& grad,
+                                   const TrainParam &param) {
+  Node<node_id_t>& child = nodes[nid];
+  child.gradSum = grad;
+  // gain and weight both follow from the gradient sum and the parameters
+  child.score = CalcGain(param, grad.g, grad.h);
+  child.weight = CalcWeight(param, grad.g, grad.h);
+  // writing the id is what takes the node out of the 'unused' state
+  child.id = nid;
+}
+
+/**
+ * @brief Helper function to update the child nodes based on the current status
+ *  of their parent node
+ * @param nodes the nodes array in which the position at 'nid' will be updated
+ * @param pid the nodeId of the parent
+ * @param gradL gradient sum for the left child node
+ * @param gradR gradient sum for the right child node
+ * @param param the training parameter struct
+ */
+template <typename node_id_t>
+DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
+                                 const gpu_gpair& gradL, const gpu_gpair& gradR,
+                                 const TrainParam &param) {
+  // children of node 'pid' sit at 2*pid+1 (left) and 2*pid+2 (right)
+  const int leftId = (pid * 2) + 1;
+  const int rightId = leftId + 1;
+  updateOneChildNode(nodes, leftId, gradL, param);
+  updateOneChildNode(nodes, rightId, gradR, param);
+}
+
+/**
+ * @brief Records the chosen split on a node and initialises its two children
+ * @param nodes the nodes array; the entry at 'absNodeId' and the entries of
+ *  its two children are written
+ * @param s the split being applied (not read by this function)
+ * @param n the node being split (read for its gradSum and score)
+ * @param absNodeId index of the node being split inside 'nodes'
+ * @param colId column (feature) index stored as the split column
+ * @param gradScan gradient scan value at the split position
+ * @param colSum gradient sum subtracted from n.gradSum to obtain the part
+ *  that follows the default direction
+ * @param thresh value stored as the split threshold
+ * @param param the training parameter struct
+ */
+template <typename node_id_t>
+DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
+                                      const Node<node_id_t>& n, int absNodeId, int colId,
+                                      const gpu_gpair& gradScan,
+                                      const gpu_gpair& colSum, float thresh,
+                                      const TrainParam &param) {
+  // decide the default direction; only the direction is kept, the returned
+  // loss change is not needed here
+  const gpu_gpair missing = n.gradSum - colSum;
+  bool missingLeft = true;
+  loss_chg_missing(gradScan, missing, n.gradSum, n.score, param, missingLeft);
+  // gradient sums of the children: the left one takes the scan (plus the
+  // missing part when defaulting left), the right one takes the remainder
+  const gpu_gpair lGradSum =
+    missingLeft ? (gradScan + n.gradSum - colSum) : gradScan;
+  const gpu_gpair rGradSum = n.gradSum - lGradSum;
+  updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
+  // finally store the split description on the node itself
+  Node<node_id_t>& self = nodes[absNodeId];
+  self.dir = missingLeft? LeftDir : RightDir;
+  self.colIdx = colId;
+  self.threshold = thresh;
+}
+
+/**
+ * @brief Kernel that turns the split found for each node into node updates
+ *
+ * One thread handles one entry of 'nodeSplits'; threads whose global index is
+ * not below 'nUniqKeys' exit immediately. 'colOffsets', 'nCols' and the
+ * BLKDIM template parameter are not read by the kernel body.
+ */
+template <typename node_id_t, int BLKDIM=256>
+__global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits,
+                                 const gpu_gpair* gradScans,
+                                 const gpu_gpair* gradSums, const float* vals,
+                                 const int* colIds, const int* colOffsets,
+                                 const node_id_t* nodeAssigns, int nUniqKeys,
+                                 node_id_t nodeStart, int nCols,
+                                 const TrainParam param) {
+  const int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (uid >= nUniqKeys) return;
+  // position of this node in the overall nodes array
+  const int absNodeId = uid + nodeStart;
+  const Split s = nodeSplits[uid];
+  if (!s.isSplittable(param.min_split_loss)) {
+    // cannot be split further: -FLT_MAX as score marks the node as a leaf
+    nodes[absNodeId].score = -FLT_MAX;
+    return;
+  }
+  const int idx = s.index;
+  // key under which the gradient sum for this node and column is stored
+  const int nodeInstId = abs2uniqKey(idx, nodeAssigns, colIds, nodeStart,
+                                     nUniqKeys);
+  updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId,
+                        colIds[idx], gradScans[idx],
+                        gradSums[nodeInstId], vals[idx], param);
+}
+
+/**
+ * @brief function to convert split information into node
+ * @tparam node_id_t type used for node ids
+ * @tparam BLKDIM number of threads per block used for the launch
+ * @param nodes the output nodes
+ * @param nodeSplits split information
+ * @param gradScans scan of sorted gradients across columns
+ * @param gradSums key-wise gradient reduction across columns
+ * @param vals the feature values
+ * @param colIds column indices for each element in the array
+ * @param colOffsets column segment offsets
+ * @param nodeAssigns node-id assignment to every feature value
+ * @param nUniqKeys number of nodes that we are currently working on
+ * @param nodeStart start offset of the nodes in the overall BFS tree
+ * @param nCols number of columns
+ * @param param the training parameter struct
+ * @note the launch is not followed by an error check or a synchronization
+ */
+template <typename node_id_t, int BLKDIM=256>
+void split2node(Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
+                const gpu_gpair* gradSums, const float* vals, const int* colIds,
+                const int* colOffsets, const node_id_t* nodeAssigns,
+                int nUniqKeys, node_id_t nodeStart, int nCols,
+                const TrainParam param) {
+  // nothing to convert; a grid of zero blocks would also be an invalid
+  // launch configuration
+  if (nUniqKeys <= 0) return;
+  const int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
+  // forward both template arguments so that the kernel is instantiated with
+  // the same BLKDIM that is used as the block size of the launch
+  split2nodeKernel<node_id_t, BLKDIM><<<nBlks, BLKDIM>>>(
+      nodes, nodeSplits, gradScans, gradSums, vals, colIds, colOffsets,
+      nodeAssigns, nUniqKeys, nodeStart, nCols, param);
+}
+
+}  // namespace exact
+}  // namespace tree
+}  // namespace xgboost