[GPU-Plugin] Multi-GPU gpu_id bug fixes for grow_gpu_hist and grow_gpu methods, and additional documentation for the gpu plugin. (#2463)

2017-06-30 01:04:17 -07:00
parent 91dae84a00
commit 6b287177c8
21 changed files with 578 additions and 449 deletions
--- a/plugin/updater_gpu/src/exact/argmax_by_key.cuh
+++ b/plugin/updater_gpu/src/exact/argmax_by_key.cuh
@@ -17,8 +17,8 @@

 #include "../../../../src/tree/param.h"
 #include "../common.cuh"
-#include "node.cuh"
 #include "loss_functions.cuh"
+#include "node.cuh"

 namespace xgboost {
 namespace tree {
@@ -45,7 +45,7 @@ HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
    out.index = b.index;
  } else if (a.score == b.score) {
    out.score = a.score;
-    out.index = (a.index < b.index)? a.index : b.index;
+    out.index = (a.index < b.index) ? a.index : b.index;
  } else {
    out.score = a.score;
    out.index = a.index;
@@ -54,7 +54,7 @@ HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
 }

 DEV_INLINE void atomicArgMax(Split* address, Split val) {
-  unsigned long long* intAddress = (unsigned long long*) address;
+  unsigned long long* intAddress = (unsigned long long*)address;
  unsigned long long old = *intAddress;
  unsigned long long assumed;
  do {
@@ -65,23 +65,19 @@ DEV_INLINE void atomicArgMax(Split* address, Split val) {
 }

 template <typename node_id_t>
-DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
-                                  const gpu_gpair* gradScans,
-                                  const gpu_gpair* gradSums, const float* vals,
-                                  const int* colIds,
-                                  const node_id_t* nodeAssigns,
-                                  const Node<node_id_t>* nodes, int nUniqKeys,
-                                  node_id_t nodeStart, int len,
-                                  const TrainParam &param) {
+DEV_INLINE void argMaxWithAtomics(
+    int id, Split* nodeSplits, const gpu_gpair* gradScans,
+    const gpu_gpair* gradSums, const float* vals, const int* colIds,
+    const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
+    node_id_t nodeStart, int len, const TrainParam& param) {
  int nodeId = nodeAssigns[id];
  ///@todo: this is really a bad check! but will be fixed when we move
  ///   to key-based reduction
-  if ((id == 0) || !((nodeId == nodeAssigns[id-1]) &&
-                     (colIds[id] == colIds[id-1]) &&
-                     (vals[id] == vals[id-1]))) {
+  if ((id == 0) ||
+      !((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
+        (vals[id] == vals[id - 1]))) {
    if (nodeId != UNUSED_NODE) {
-      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart,
-                              nUniqKeys);
+      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
      gpu_gpair colSum = gradSums[sumId];
      int uid = nodeId - nodeStart;
      Node<node_id_t> n = nodes[nodeId];
@@ -90,23 +86,20 @@ DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
      bool tmp;
      Split s;
      gpu_gpair missing = parentSum - colSum;
-      s.score = loss_chg_missing(gradScans[id], missing, parentSum,
-                                 parentGain, param, tmp);
+      s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain,
+                                 param, tmp);
      s.index = id;
-      atomicArgMax(nodeSplits+uid, s);
-    } // end if nodeId != UNUSED_NODE
-  } // end if id == 0 ...
+      atomicArgMax(nodeSplits + uid, s);
+    }  // end if nodeId != UNUSED_NODE
+  }    // end if id == 0 ...
 }

 template <typename node_id_t>
-__global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
-                                      const gpu_gpair* gradScans,
-                                      const gpu_gpair* gradSums,
-                                      const float* vals, const int* colIds,
-                                      const node_id_t* nodeAssigns,
-                                      const Node<node_id_t>* nodes, int nUniqKeys,
-                                      node_id_t nodeStart, int len,
-                                      const TrainParam param) {
+__global__ void atomicArgMaxByKeyGmem(
+    Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums,
+    const float* vals, const int* colIds, const node_id_t* nodeAssigns,
+    const Node<node_id_t>* nodes, int nUniqKeys, node_id_t nodeStart, int len,
+    const TrainParam param) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < len; id += stride) {
@@ -116,19 +109,16 @@ __global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
 }

 template <typename node_id_t>
-__global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
-                                      const gpu_gpair* gradScans,
-                                      const gpu_gpair* gradSums,
-                                      const float* vals, const int* colIds,
-                                      const node_id_t* nodeAssigns,
-                                      const Node<node_id_t>* nodes, int nUniqKeys,
-                                      node_id_t nodeStart, int len,
-                                      const TrainParam param) {
+__global__ void atomicArgMaxByKeySmem(
+    Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums,
+    const float* vals, const int* colIds, const node_id_t* nodeAssigns,
+    const Node<node_id_t>* nodes, int nUniqKeys, node_id_t nodeStart, int len,
+    const TrainParam param) {
  extern __shared__ char sArr[];
  Split* sNodeSplits = (Split*)sArr;
  int tid = threadIdx.x;
  Split defVal;
-  #pragma unroll 1
+#pragma unroll 1
  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
    sNodeSplits[i] = defVal;
  }
@@ -142,7 +132,7 @@ __global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
  __syncthreads();
  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
    Split s = sNodeSplits[i];
-    atomicArgMax(nodeSplits+i, s);
+    atomicArgMax(nodeSplits + i, s);
  }
 }

@@ -162,28 +152,30 @@ __global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
 * @param param training parameters
 * @param algo which algorithm to use for argmax_by_key
 */
-template <typename node_id_t, int BLKDIM=256, int ITEMS_PER_THREAD=4>
+template <typename node_id_t, int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
 void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
-                 const gpu_gpair* gradSums, const float* vals, const int* colIds,
-                 const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
+                 const gpu_gpair* gradSums, const float* vals,
+                 const int* colIds, const node_id_t* nodeAssigns,
+                 const Node<node_id_t>* nodes, int nUniqKeys,
                 node_id_t nodeStart, int len, const TrainParam param,
                 ArgMaxByKeyAlgo algo) {
-  fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
-  int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
-  switch(algo) {
-  case ABK_GMEM:
-    atomicArgMaxByKeyGmem<node_id_t><<<nBlks,BLKDIM>>>
-        (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
-         nUniqKeys, nodeStart, len, param);
-    break;
-  case ABK_SMEM:
-    atomicArgMaxByKeySmem<node_id_t>
-        <<<nBlks,BLKDIM,sizeof(Split)*nUniqKeys>>>
-        (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
-         nUniqKeys, nodeStart, len, param);
-    break;
-  default:
-    throw std::runtime_error("argMaxByKey: Bad algo passed!");
+  fillConst<Split, BLKDIM, ITEMS_PER_THREAD>(dh::get_device_idx(param.gpu_id),
+                                             nodeSplits, nUniqKeys, Split());
+  int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD * BLKDIM);
+  switch (algo) {
+    case ABK_GMEM:
+      atomicArgMaxByKeyGmem<node_id_t><<<nBlks, BLKDIM>>>(
+          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+          nUniqKeys, nodeStart, len, param);
+      break;
+    case ABK_SMEM:
+      atomicArgMaxByKeySmem<
+          node_id_t><<<nBlks, BLKDIM, sizeof(Split) * nUniqKeys>>>(
+          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+          nUniqKeys, nodeStart, len, param);
+      break;
+    default:
+      throw std::runtime_error("argMaxByKey: Bad algo passed!");
  };
 }

--- a/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh
+++ b/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh
@@ -18,7 +18,6 @@
 #include "../common.cuh"
 #include "gradients.cuh"

-
 namespace xgboost {
 namespace tree {
 namespace exact {
@@ -41,7 +40,7 @@ static const int NONE_KEY = -100;
 * @param tmpKeys keys buffer
 * @param size number of elements that will be scanned
 */
-template <int BLKDIM_L1L3=256>
+template <int BLKDIM_L1L3 = 256>
 int scanTempBufferSize(int size) {
  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
  return nBlks;
@@ -49,7 +48,7 @@ int scanTempBufferSize(int size) {

 struct AddByKey {
  template <typename T>
-  HOST_DEV_INLINE T operator()(const T &first, const T &second) const {
+  HOST_DEV_INLINE T operator()(const T& first, const T& second) const {
    T result;
    if (first.key == second.key) {
      result.key = first.key;
@@ -74,7 +73,7 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
  typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;
  Pair threadData;
-  int tid = blockIdx.x*BLKDIM_L1L3 + threadIdx.x;
+  int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
  if (tid < size) {
    myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
    myValue = get(tid, vals, instIds);
@@ -82,7 +81,7 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
    myKey = NONE_KEY;
    myValue = 0.f;
  }
-  threadData.key   = myKey;
+  threadData.key = myKey;
  threadData.value = myValue;
  // get previous key, especially needed for the last thread in this block
  // in order to pass on the partial scan values.
@@ -90,18 +89,17 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
  // else, the result of this shuffle operation will be undefined
  int previousKey = __shfl_up(myKey, 1);
  // Collectively compute the block-wide exclusive prefix sum
-  BlockScan(temp_storage).ExclusiveScan(threadData, threadData, rootPair,
-                                        AddByKey());
+  BlockScan(temp_storage)
+      .ExclusiveScan(threadData, threadData, rootPair, AddByKey());
  if (tid < size) {
    scans[tid] = threadData.value;
  } else {
    return;
  }
  if (threadIdx.x == BLKDIM_L1L3 - 1) {
-    threadData.value = (myKey == previousKey)? 
-                        threadData.value :
-                        gpu_gpair(0.0f, 0.0f);
-    mKeys[blockIdx.x]  = myKey;
+    threadData.value =
+        (myKey == previousKey) ? threadData.value : gpu_gpair(0.0f, 0.0f);
+    mKeys[blockIdx.x] = myKey;
    mScans[blockIdx.x] = threadData.value + myValue;
  }
 }
@@ -111,11 +109,10 @@ __global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) {
  typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
  Pair threadData;
  __shared__ typename BlockScan::TempStorage temp_storage;
-  for (int i = threadIdx.x; i < mLength; i += BLKSIZE-1) {
-    threadData.key   =  mKeys[i];
+  for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
+    threadData.key = mKeys[i];
    threadData.value = mScans[i];
-    BlockScan(temp_storage).InclusiveScan(threadData, threadData,
-                                          AddByKey());
+    BlockScan(temp_storage).InclusiveScan(threadData, threadData, AddByKey());
    mScans[i] = threadData.value;
    __syncthreads();
  }
@@ -136,15 +133,14 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
  __shared__ char gradBuff[sizeof(gpu_gpair)];
  __shared__ int s_mKeys;
  gpu_gpair* s_mScans = (gpu_gpair*)gradBuff;
-  if(tid >= size)
-    return;
+  if (tid >= size) return;
  // cache block-wide partial scan info
  if (relId == 0) {
-    s_mKeys = (blockIdx.x > 0)? mKeys[blockIdx.x-1] : NONE_KEY;
-    s_mScans[0] = (blockIdx.x > 0)? mScans[blockIdx.x-1] : gpu_gpair();
+    s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : NONE_KEY;
+    s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : gpu_gpair();
  }
  int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
-  int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid-1, keys, colIds,
+  int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid - 1, keys, colIds,
                                                      nodeStart, nUniqKeys);
  gpu_gpair myValue = scans[tid];
  __syncthreads();
@@ -162,9 +158,11 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
 }

 /**
- * @brief Performs fused reduce and scan by key functionality. It is assumed that
+ * @brief Performs fused reduce and scan by key functionality.
+ * It is assumed that
 *  the keys occur contiguously!
- * @param sums the output gradient reductions for each element performed key-wise
+ * @param sums the output gradient reductions for each element performed
+ * key-wise
 * @param scans the output gradient scans for each element performed key-wise
 * @param vals the gradients evaluated for each observation.
 * @param instIds instance ids for each element
@@ -179,20 +177,20 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
 * @param colIds column indices for each element in the array
 * @param nodeStart index of the leftmost node in the current level
 */
-template <typename node_id_t, int BLKDIM_L1L3=256, int BLKDIM_L2=512>
+template <typename node_id_t, int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
 void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals,
                     const int* instIds, const node_id_t* keys, int size,
                     int nUniqKeys, int nCols, gpu_gpair* tmpScans,
                     int* tmpKeys, const int* colIds, node_id_t nodeStart) {
  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
-  cudaMemset(sums, 0, nUniqKeys*nCols*sizeof(gpu_gpair));
-  cubScanByKeyL1<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
-      (scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
-       nodeStart, size);
+  cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(gpu_gpair));
+  cubScanByKeyL1<node_id_t, BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>(
+      scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
+      nodeStart, size);
  cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
-  cubScanByKeyL3<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
-      (sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
-       nodeStart, size);
+  cubScanByKeyL3<node_id_t, BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>(
+      sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
+      nodeStart, size);
 }

 }  // namespace exact
--- a/plugin/updater_gpu/src/exact/gpu_builder.cuh
+++ b/plugin/updater_gpu/src/exact/gpu_builder.cuh
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights
+ * reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,18 +16,17 @@
 */
 #pragma once

-#include "../../../../src/tree/param.h"
-#include "xgboost/tree_updater.h"
-#include "cub/cub.cuh"
-#include "../common.cuh"
 #include <vector>
-#include "loss_functions.cuh"
-#include "gradients.cuh"
-#include "node.cuh"
+#include "../../../../src/tree/param.h"
+#include "../common.cuh"
 #include "argmax_by_key.cuh"
-#include "split2node.cuh"
+#include "cub/cub.cuh"
 #include "fused_scan_reduce_by_key.cuh"
-
+#include "gradients.cuh"
+#include "loss_functions.cuh"
+#include "node.cuh"
+#include "split2node.cuh"
+#include "xgboost/tree_updater.h"

 namespace xgboost {
 namespace tree {
@@ -48,8 +48,8 @@ template <typename node_id_t>
 __global__ void assignColIds(int* colIds, const int* colOffsets) {
  int myId = blockIdx.x;
  int start = colOffsets[myId];
-  int end = colOffsets[myId+1];
-  for (int id = start+threadIdx.x; id < end; id += blockDim.x) {
+  int end = colOffsets[myId + 1];
+  for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
    colIds[id] = myId;
  }
 }
@@ -70,7 +70,7 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
  node_id_t result;
  if (n.isLeaf() || n.isUnused()) {
    result = UNUSED_NODE;
-  } else if(n.isDefaultLeft()) {
+  } else if (n.isDefaultLeft()) {
    result = (2 * n.id) + 1;
  } else {
    result = (2 * n.id) + 2;
@@ -81,8 +81,9 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
 template <typename node_id_t>
 __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
                              const node_id_t* nodeIds, const int* instId,
-                              const Node<node_id_t>* nodes, const int* colOffsets,
-                              const float* vals, int nVals, int nCols) {
+                              const Node<node_id_t>* nodes,
+                              const int* colOffsets, const float* vals,
+                              int nVals, int nCols) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < nVals; id += stride) {
@@ -95,7 +96,7 @@ __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
    if (nId != UNUSED_NODE) {
      const Node<node_id_t> n = nodes[nId];
      int colId = n.colIdx;
-      //printf("nid=%d colId=%d id=%d\n", nId, colId, id);
+      // printf("nid=%d colId=%d id=%d\n", nId, colId, id);
      int start = colOffsets[colId];
      int end = colOffsets[colId + 1];
      ///@todo: too much wasteful threads!!
@@ -114,20 +115,26 @@ __global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
    int lid = (id << 1) + 1;
    int rid = (id << 1) + 2;
    if ((lid >= len) || (rid >= len)) {
-      nodes[id].score = -FLT_MAX; // bottom-most nodes
+      nodes[id].score = -FLT_MAX;  // bottom-most nodes
    } else if (nodes[lid].isUnused() && nodes[rid].isUnused()) {
-      nodes[id].score = -FLT_MAX; // unused child nodes
+      nodes[id].score = -FLT_MAX;  // unused child nodes
    }
  }
 }

 // unit test forward declaration for friend function access
-template <typename node_id_t> void testSmallData();
-template <typename node_id_t> void testLargeData();
-template <typename node_id_t> void testAllocate();
-template <typename node_id_t> void testMarkLeaves();
-template <typename node_id_t> void testDense2Sparse();
-template <typename node_id_t> class GPUBuilder;
+template <typename node_id_t>
+void testSmallData();
+template <typename node_id_t>
+void testLargeData();
+template <typename node_id_t>
+void testAllocate();
+template <typename node_id_t>
+void testMarkLeaves();
+template <typename node_id_t>
+void testDense2Sparse();
+template <typename node_id_t>
+class GPUBuilder;
 template <typename node_id_t>
 std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
    const std::string& file,
@@ -136,7 +143,7 @@ std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
 template <typename node_id_t>
 class GPUBuilder {
 public:
-  GPUBuilder(): allocated(false) {}
+  GPUBuilder() : allocated(false) {}

  ~GPUBuilder() {}

@@ -146,10 +153,10 @@ class GPUBuilder {
    maxLeaves = 1 << param.max_depth;
  }

-  void UpdateParam(const TrainParam &param) { this->param = param; }
+  void UpdateParam(const TrainParam& param) { this->param = param; }

  /// @note: Update should be only after Init!!
-  void Update(const std::vector<bst_gpair>& gpair, DMatrix *hMat,
+  void Update(const std::vector<bst_gpair>& gpair, DMatrix* hMat,
              RegTree* hTree) {
    if (!allocated) {
      setupOneTimeData(*hMat);
@@ -171,7 +178,7 @@ class GPUBuilder {
    dense2sparse(*hTree);
  }

-private:
+ private:
  friend void testSmallData<node_id_t>();
  friend void testLargeData<node_id_t>();
  friend void testAllocate<node_id_t>();
@@ -194,7 +201,7 @@ private:
  dh::dvec<gpu_gpair> gradsInst;
  dh::dvec2<node_id_t> nodeAssigns;
  dh::dvec2<int> nodeLocations;
-  dh::dvec<Node<node_id_t> > nodes;
+  dh::dvec<Node<node_id_t>> nodes;
  dh::dvec<node_id_t> nodeAssignsPerInst;
  dh::dvec<gpu_gpair> gradSums;
  dh::dvec<gpu_gpair> gradScans;
@@ -218,35 +225,26 @@ private:
    argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
                vals.current(), colIds.data(), nodeAssigns.current(),
                nodes.data(), nNodes, nodeStart, nVals, param,
-                level<=MAX_ABK_LEVELS? ABK_SMEM : ABK_GMEM);
+                level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
    split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
-               gradSums.data(), vals.current(), colIds.data(), colOffsets.data(),
-               nodeAssigns.current(), nNodes, nodeStart, nCols, param);
+               gradSums.data(), vals.current(), colIds.data(),
+               colOffsets.data(), nodeAssigns.current(), nNodes, nodeStart,
+               nCols, param);
  }

  void allocateAllData(int offsetSize) {
    int tmpBuffSize = scanTempBufferSize(nVals);
-    ba.allocate(param.gpu_id,
-                &vals, nVals,
-                &vals_cached, nVals,
-                &instIds, nVals,
-                &instIds_cached, nVals,
-                &colOffsets, offsetSize,
-                &gradsInst, nRows,
-                &nodeAssigns, nVals,
-                &nodeLocations, nVals,
-                &nodes, maxNodes,
-                &nodeAssignsPerInst, nRows,
-                &gradSums, maxLeaves*nCols,
-                &gradScans, nVals,
-                &nodeSplits, maxLeaves,
-                &tmpScanGradBuff, tmpBuffSize,
-                &tmpScanKeyBuff, tmpBuffSize,
-                &colIds, nVals);
+    ba.allocate(dh::get_device_idx(param.gpu_id), &vals, nVals, &vals_cached,
+                nVals, &instIds, nVals, &instIds_cached, nVals, &colOffsets,
+                offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
+                &nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
+                nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals,
+                &nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize,
+                &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
  }

  void setupOneTimeData(DMatrix& hMat) {
-    size_t free_memory = dh::available_memory(param.gpu_id);
+    size_t free_memory = dh::available_memory(dh::get_device_idx(param.gpu_id));
    if (!hMat.SingleColBlock()) {
      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
    }
@@ -259,7 +257,8 @@ private:
    if (!param.silent) {
      const int mb_size = 1048576;
      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
-                   << free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
+                   << free_memory / mb_size << " MB on "
+                   << dh::device_name(dh::get_device_idx(param.gpu_id));
    }
  }

@@ -282,9 +281,10 @@ private:
    iter->BeforeFirst();
    while (iter->Next()) {
      const ColBatch& batch = iter->Value();
-      for (int i=0;i<batch.size;i++) {
+      for (int i = 0; i < batch.size; i++) {
        const ColBatch::Inst& col = batch[i];
-        for (const ColBatch::Entry* it=col.data;it!=col.data+col.length;it++) {
+        for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
+             it++) {
          int inst_id = static_cast<int>(it->index);
          fval.push_back(it->fvalue);
          fId.push_back(inst_id);
@@ -301,16 +301,17 @@ private:
    vals.current_dvec() = fval;
    instIds.current_dvec() = fId;
    colOffsets = offset;
-    segmentedSort<float,int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
+    segmentedSort<float, int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
    vals_cached = vals.current_dvec();
    instIds_cached = instIds.current_dvec();
-    assignColIds<node_id_t><<<nCols,512>>>(colIds.data(), colOffsets.data());
+    assignColIds<node_id_t><<<nCols, 512>>>(colIds.data(), colOffsets.data());
  }

  void transferGrads(const std::vector<bst_gpair>& gpair) {
    // HACK
    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
-                             sizeof(gpu_gpair)*nRows, cudaMemcpyHostToDevice));
+                             sizeof(gpu_gpair) * nRows,
+                             cudaMemcpyHostToDevice));
    // evaluate the full-grad reduction for the root node
    sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
  }
@@ -324,25 +325,23 @@ private:
      // for root node, just update the gradient/score/weight/id info
      // before splitting it! Currently all data is on GPU, hence this
      // stupid little kernel
-      initRootNode<<<1,1>>>(nodes.data(), gradSums.data(), param);
+      initRootNode<<<1, 1>>>(nodes.data(), gradSums.data(), param);
    } else {
      const int BlkDim = 256;
      const int ItemsPerThread = 4;
      // assign default node ids first
      int nBlks = dh::div_round_up(nRows, BlkDim);
-      fillDefaultNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
-                                           nodes.data(), nRows);
+      fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.data(),
+                                            nodes.data(), nRows);
      // evaluate the correct child indices of non-missing values next
-      nBlks = dh::div_round_up(nVals, BlkDim*ItemsPerThread);
-      assignNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
-                                      nodeLocations.current(),
-                                      nodeAssigns.current(),
-                                      instIds.current(), nodes.data(),
-                                      colOffsets.data(), vals.current(),
-                                      nVals, nCols);
+      nBlks = dh::div_round_up(nVals, BlkDim * ItemsPerThread);
+      assignNodeIds<<<nBlks, BlkDim>>>(
+          nodeAssignsPerInst.data(), nodeLocations.current(),
+          nodeAssigns.current(), instIds.current(), nodes.data(),
+          colOffsets.data(), vals.current(), nVals, nCols);
      // gather the node assignments across all other columns too
-      gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
-                        instIds.current(), nVals);
+      gather<node_id_t>(dh::get_device_idx(param.gpu_id), nodeAssigns.current(),
+                        nodeAssignsPerInst.data(), instIds.current(), nVals);
      sortKeys(level);
    }
  }
@@ -351,9 +350,10 @@ private:
    // segmented-sort the arrays based on node-id's
    // but we don't need more than level+1 bits for sorting!
    segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
-                  0, level+1);
-    gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
-                      instIds.current(), nodeLocations.current(), nVals);
+                  0, level + 1);
+    gather<float, int>(dh::get_device_idx(param.gpu_id), vals.other(),
+                       vals.current(), instIds.other(), instIds.current(),
+                       nodeLocations.current(), nVals);
    vals.buff().selector ^= 1;
    instIds.buff().selector ^= 1;
  }
@@ -361,11 +361,11 @@ private:
  void markLeaves() {
    const int BlkDim = 128;
    int nBlks = dh::div_round_up(maxNodes, BlkDim);
-    markLeavesKernel<<<nBlks,BlkDim>>>(nodes.data(), maxNodes);
+    markLeavesKernel<<<nBlks, BlkDim>>>(nodes.data(), maxNodes);
  }

-  void dense2sparse(RegTree &tree) {
-    std::vector<Node<node_id_t> > hNodes = nodes.as_vector();
+  void dense2sparse(RegTree& tree) {
+    std::vector<Node<node_id_t>> hNodes = nodes.as_vector();
    int nodeId = 0;
    for (int i = 0; i < maxNodes; ++i) {
      const Node<node_id_t>& n = hNodes[i];
@@ -375,7 +375,7 @@ private:
        ++nodeId;
      } else if (!hNodes[i].isUnused()) {
        tree.AddChilds(nodeId);
-        tree[nodeId].set_split(n.colIdx, n.threshold, n.dir==LeftDir);
+        tree[nodeId].set_split(n.colIdx, n.threshold, n.dir == LeftDir);
        tree.stat(nodeId).loss_chg = n.score;
        tree.stat(nodeId).sum_hess = n.gradSum.h;
        tree.stat(nodeId).base_weight = n.weight;
--- a/plugin/updater_gpu/src/exact/gradients.cuh
+++ b/plugin/updater_gpu/src/exact/gradients.cuh
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights
+ * reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,6 @@

 #include "../common.cuh"

-
 namespace xgboost {
 namespace tree {
 namespace exact {
@@ -32,9 +32,9 @@ struct gpu_gpair {
  /** the 'h_i' as it appears in the xgboost paper */
  float h;

-  HOST_DEV_INLINE gpu_gpair(): g(0.f), h(0.f) {}
-  HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h): g(_g), h(_h) {}
-  HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a): g(a.g), h(a.h) {}
+  HOST_DEV_INLINE gpu_gpair() : g(0.f), h(0.f) {}
+  HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h) : g(_g), h(_h) {}
+  HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a) : g(a.g), h(a.h) {}

  /**
   * @brief Checks whether the hessian is more than the defined weight
@@ -60,12 +60,12 @@ struct gpu_gpair {

  HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
                                             const gpu_gpair& b) {
-    return gpu_gpair(a.g+b.g, a.h+b.h);
+    return gpu_gpair(a.g + b.g, a.h + b.h);
  }

  HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
                                             const gpu_gpair& b) {
-    return gpu_gpair(a.g-b.g, a.h-b.h);
+    return gpu_gpair(a.g - b.g, a.h - b.h);
  }

  HOST_DEV_INLINE gpu_gpair(int value) {
@@ -73,7 +73,6 @@ struct gpu_gpair {
  }
 };

-
 /**
 * @brief Gradient value getter function
 * @param id the index into the vals or instIds array to which to fetch
@@ -81,7 +80,8 @@ struct gpu_gpair {
 * @param instIds instance index buffer
 * @return the expected gradient value
 */
-HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, const int* instIds) {
+HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals,
+                              const int* instIds) {
  id = instIds[id];
  return vals[id];
 }
--- a/plugin/updater_gpu/src/exact/loss_functions.cuh
+++ b/plugin/updater_gpu/src/exact/loss_functions.cuh
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights
+ * reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,17 +19,13 @@
 #include "../common.cuh"
 #include "gradients.cuh"

-
 namespace xgboost {
 namespace tree {
 namespace exact {

-HOST_DEV_INLINE float device_calc_loss_chg(const TrainParam &param,
-                                           const gpu_gpair &scan,
-                                           const gpu_gpair &missing,
-                                           const gpu_gpair &parent_sum,
-                                           const float &parent_gain,
-                                           bool missing_left) {
+HOST_DEV_INLINE float device_calc_loss_chg(
+    const TrainParam &param, const gpu_gpair &scan, const gpu_gpair &missing,
+    const gpu_gpair &parent_sum, const float &parent_gain, bool missing_left) {
  gpu_gpair left = scan;
  if (missing_left) {
    left += missing;
--- a/plugin/updater_gpu/src/exact/node.cuh
+++ b/plugin/updater_gpu/src/exact/node.cuh
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights reserved.
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors.  All rights
+ * reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,9 +16,8 @@
 */
 #pragma once

-#include "gradients.cuh"
 #include "../common.cuh"
-
+#include "gradients.cuh"

 namespace xgboost {
 namespace tree {
@@ -34,11 +34,9 @@ enum DefaultDirection {
  RightDir
 };

-
 /** used to assign default id to a Node */
 static const int UNUSED_NODE = -1;

-
 /**
 * @struct Split node.cuh
 * @brief Abstraction of a possible split in the decision tree
@@ -49,7 +47,7 @@ struct Split {
  /** index where to split in the DMatrix */
  int index;

-  HOST_DEV_INLINE Split(): score(-FLT_MAX), index(INT_MAX) {}
+  HOST_DEV_INLINE Split() : score(-FLT_MAX), index(INT_MAX) {}

  /**
   * @brief Whether the split info is valid to be used to create a new child
@@ -61,7 +59,6 @@ struct Split {
  }
 };

-
 /**
 * @struct Node node.cuh
 * @brief Abstraction of a node in the decision tree
@@ -84,9 +81,14 @@ class Node {
  /** node id (used as key for reduce/scan) */
  node_id_t id;

-  HOST_DEV_INLINE Node(): gradSum(), score(-FLT_MAX), weight(-FLT_MAX),
-                          dir(LeftDir), threshold(0.f), colIdx(UNUSED_NODE),
-                          id(UNUSED_NODE) {}
+  HOST_DEV_INLINE Node()
+      : gradSum(),
+        score(-FLT_MAX),
+        weight(-FLT_MAX),
+        dir(LeftDir),
+        threshold(0.f),
+        colIdx(UNUSED_NODE),
+        id(UNUSED_NODE) {}

  /** Tells whether this node is part of the decision tree */
  HOST_DEV_INLINE bool isUnused() const { return (id == UNUSED_NODE); }
@@ -100,7 +102,6 @@ class Node {
  HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
 };

-
 /**
 * @struct Segment node.cuh
 * @brief Space inefficient, but super easy to implement structure to define
@@ -112,7 +113,7 @@ struct Segment {
  /** end index of the segment */
  int end;

-  HOST_DEV_INLINE Segment(): start(-1), end(-1) {}
+  HOST_DEV_INLINE Segment() : start(-1), end(-1) {}

  /** Checks whether the current structure defines a valid segment */
  HOST_DEV_INLINE bool isValid() const {
@@ -120,7 +121,6 @@ struct Segment {
  }
 };

-
 /**
 * @enum NodeType node.cuh
 * @brief Useful to describe the node type in a dense BFS-order tree array
@@ -134,7 +134,6 @@ enum NodeType {
  UNUSED
 };

-
 /**
 * @brief Absolute BFS order IDs to col-wise unique IDs based on user input
 * @param tid the index of the element that this thread should access
--- a/plugin/updater_gpu/src/exact/split2node.cuh
+++ b/plugin/updater_gpu/src/exact/split2node.cuh
@@ -17,9 +17,8 @@

 #include "../../../../src/tree/param.h"
 #include "gradients.cuh"
-#include "node.cuh"
 #include "loss_functions.cuh"
-
+#include "node.cuh"

 namespace xgboost {
 namespace tree {
@@ -39,7 +38,7 @@ namespace exact {
 template <typename node_id_t>
 DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
                                   const gpu_gpair& grad,
-                                   const TrainParam &param) {
+                                   const TrainParam& param) {
  nodes[nid].gradSum = grad;
  nodes[nid].score = CalcGain(param, grad.g, grad.h);
  nodes[nid].weight = CalcWeight(param, grad.g, grad.h);
@@ -58,18 +57,18 @@ DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
 template <typename node_id_t>
 DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
                                 const gpu_gpair& gradL, const gpu_gpair& gradR,
-                                 const TrainParam &param) {
+                                 const TrainParam& param) {
  int childId = (pid * 2) + 1;
  updateOneChildNode(nodes, childId, gradL, param);
-  updateOneChildNode(nodes, childId+1, gradR, param);
+  updateOneChildNode(nodes, childId + 1, gradR, param);
 }

 template <typename node_id_t>
 DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
-                                      const Node<node_id_t>& n, int absNodeId, int colId,
-                                      const gpu_gpair& gradScan,
+                                      const Node<node_id_t>& n, int absNodeId,
+                                      int colId, const gpu_gpair& gradScan,
                                      const gpu_gpair& colSum, float thresh,
-                                      const TrainParam &param) {
+                                      const TrainParam& param) {
  bool missingLeft = true;
  // get the default direction for the current node
  gpu_gpair missing = n.gradSum - colSum;
@@ -84,19 +83,17 @@ DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
  rGradSum = n.gradSum - lGradSum;
  updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
  // update default-dir, threshold and feature id for current node
-  nodes[absNodeId].dir = missingLeft? LeftDir : RightDir;
+  nodes[absNodeId].dir = missingLeft ? LeftDir : RightDir;
  nodes[absNodeId].colIdx = colId;
  nodes[absNodeId].threshold = thresh;
 }

-template <typename node_id_t, int BLKDIM=256>
-__global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits,
-                                 const gpu_gpair* gradScans,
-                                 const gpu_gpair* gradSums, const float* vals,
-                                 const int* colIds, const int* colOffsets,
-                                 const node_id_t* nodeAssigns, int nUniqKeys,
-                                 node_id_t nodeStart, int nCols,
-                                 const TrainParam param) {
+template <typename node_id_t, int BLKDIM = 256>
+__global__ void split2nodeKernel(
+    Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
+    const gpu_gpair* gradSums, const float* vals, const int* colIds,
+    const int* colOffsets, const node_id_t* nodeAssigns, int nUniqKeys,
+    node_id_t nodeStart, int nCols, const TrainParam param) {
  int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (uid >= nUniqKeys) {
    return;
@@ -105,11 +102,11 @@ __global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits
  Split s = nodeSplits[uid];
  if (s.isSplittable(param.min_split_loss)) {
    int idx = s.index;
-    int nodeInstId = abs2uniqKey(idx, nodeAssigns, colIds, nodeStart,
-                                 nUniqKeys);
-    updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId,
-                          colIds[idx], gradScans[idx],
-                          gradSums[nodeInstId], vals[idx], param);
+    int nodeInstId =
+        abs2uniqKey(idx, nodeAssigns, colIds, nodeStart, nUniqKeys);
+    updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId, colIds[idx],
+                          gradScans[idx], gradSums[nodeInstId], vals[idx],
+                          param);
  } else {
    // cannot be split further, so this node is a leaf!
    nodes[absNodeId].score = -FLT_MAX;
@@ -129,20 +126,20 @@ __global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits
 * @param nUniqKeys number of nodes that we are currently working on
 * @param nodeStart start offset of the nodes in the overall BFS tree
 * @param nCols number of columns
- * @param preUniquifiedKeys whether to uniquify the keys from inside kernel or not
+ * @param preUniquifiedKeys whether to uniquify the keys from inside kernel or
+ * not
 * @param param the training parameter struct
 */
-template <typename node_id_t, int BLKDIM=256>
-void split2node(Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
-                const gpu_gpair* gradSums, const float* vals, const int* colIds,
-                const int* colOffsets, const node_id_t* nodeAssigns,
-                int nUniqKeys, node_id_t nodeStart, int nCols,
-                const TrainParam param) {
+template <typename node_id_t, int BLKDIM = 256>
+void split2node(Node<node_id_t>* nodes, const Split* nodeSplits,
+                const gpu_gpair* gradScans, const gpu_gpair* gradSums,
+                const float* vals, const int* colIds, const int* colOffsets,
+                const node_id_t* nodeAssigns, int nUniqKeys,
+                node_id_t nodeStart, int nCols, const TrainParam param) {
  int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
-  split2nodeKernel<<<nBlks,BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
-                                     vals, colIds, colOffsets, nodeAssigns,
-                                     nUniqKeys, nodeStart, nCols,
-                                     param);
+  split2nodeKernel<<<nBlks, BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
+                                      vals, colIds, colOffsets, nodeAssigns,
+                                      nUniqKeys, nodeStart, nCols, param);
 }

 }  // namespace exact