[GPU-Plugin] Integration of a faster version of grow_gpu plugin into mainstream (#2360)
* Integrating a faster version of the grow_gpu plugin
  1. Removed the older files to reduce duplication
  2. Moved all of the grow_gpu files under the 'exact' folder
  3. Placed all of them inside the 'exact' namespace to avoid any conflicts
  4. Fixed a bug in benchmark.py when running only the 'grow_gpu' plugin
  5. Added cub and googletest submodules to ease integration and unit-testing
  6. Updated CMakeLists.txt to build CUDA objects directly into libxgboost
* Added support for building GPU plugins through the make flow
  1. Updated Makefile and config.mk to add the right targets
  2. Added unit-tests for the GPU exact plugin code
* Added support for building the GPU plugin using the 'make' flow as well; updated instructions for building and testing the GPU plugin
* Fixed travis-ci errors for PR #2360
  1. Lint errors on unit-tests
  2. Removed googletest; instead depend on the gtest cache provided by dmlc-core
* Some more fixes to travis-ci lint failures for PR #2360
* Added Rory's copyright to the files containing code from both authors
* Updated the copyright statement as per Rory's request
* Moved the static datasets into a script that generates them at runtime
* Memory usage print when silent=0; tests/ and test/ folder organization; removed the dependency on googletest for just building xgboost; coding style updates for .cuh files as well
* Fixes for compilation warnings
* Add CUDA object files as well when JVM_BINDINGS=ON
Parent: 2d9052bc7d
Commit: 85b2fb3eee
4  .gitignore (vendored)
@@ -91,4 +91,6 @@ build_tests
lib/

# spark
metastore_db
metastore_db

plugin/updater_gpu/test/cpp/data
3  .gitmodules (vendored)
@@ -4,3 +4,6 @@
[submodule "rabit"]
path = rabit
url = https://github.com/dmlc/rabit
[submodule "cub"]
path = cub
url = https://github.com/NVlabs/cub
CMakeLists.txt
@@ -2,10 +2,15 @@ cmake_minimum_required (VERSION 3.2)
project (xgboost)
find_package(OpenMP)

option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")
if(PLUGIN_UPDATER_GPU)
find_package(CUDA REQUIRED)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O3 -funroll-loops -msse2 -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O3 -funroll-loops -msse2 -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
endif()

@@ -21,13 +26,15 @@ else()
CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)

set(STD_FLAG "")
if(COMPILER_SUPPORTS_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(STD_FLAG "-std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
set(STD_FLAG "-std=c++0x")
else()
message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STD_FLAG}")
endif()

#Make sure we are using the static runtime
@@ -93,33 +100,33 @@ endif()

set(LINK_LIBRARIES dmlccore rabit)

option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")

if(PLUGIN_UPDATER_GPU)
#Find cub
set(CUB_DIRECTORY "" CACHE PATH "CUB 1.5.4 directory")
include_directories(${CUB_DIRECTORY})
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_35;-lineinfo;")
#Find cub
set(CUB_DIRECTORY "cub/" CACHE PATH "CUB 1.5.4 directory")
include_directories(${CUB_DIRECTORY})
#Find googletest
set(GTEST_DIRECTORY "${CACHE_PREFIX}" CACHE PATH "Googletest directory")
include_directories(${GTEST_DIRECTORY}/include)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_60;-lineinfo;")
if(NOT MSVC)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC")
endif()
set(SOURCES ${SOURCES}
plugin/updater_gpu/src/updater_gpu.cc
set(CUDA_SOURCES
plugin/updater_gpu/src/updater_gpu.cu
plugin/updater_gpu/src/gpu_hist_builder.cu
)
find_package(CUDA QUIET REQUIRED)
file(GLOB_RECURSE CUDA_SOURCES "plugin/updater_gpu/src/*")
cuda_add_library(updater_gpu STATIC
${CUDA_SOURCES}
)

set(LINK_LIBRARIES ${LINK_LIBRARIES} updater_gpu)
cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})
else()
set(CUDA_OBJS "")
endif()

add_library(objxgboost OBJECT ${SOURCES})
set_target_properties(${objxgboost} PROPERTIES POSITION_INDEPENDENT_CODE 1)

add_library(libxgboost SHARED $<TARGET_OBJECTS:objxgboost>)
add_executable(xgboost $<TARGET_OBJECTS:objxgboost>)
add_library(libxgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
add_executable(xgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})

target_link_libraries(xgboost ${LINK_LIBRARIES})
target_link_libraries(libxgboost ${LINK_LIBRARIES})

@@ -132,6 +139,7 @@ if(JVM_BINDINGS)

add_library(libxgboost4j SHARED
$<TARGET_OBJECTS:objxgboost>
${CUDA_OBJS}
jvm-packages/xgboost4j/src/native/xgboost4j.cpp)
target_link_libraries(libxgboost4j
${LINK_LIBRARIES}
34  Makefile
@@ -53,7 +53,7 @@ endif

export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS)
export CFLAGS= -std=c++11 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include -I$(GTEST_PATH)/include
#java include path
export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java

@@ -84,12 +84,28 @@ ifeq ($(UNAME), Darwin)
JAVAINCFLAGS += -I${JAVA_HOME}/include/darwin
endif

OPENMP_FLAGS =
ifeq ($(USE_OPENMP), 1)
CFLAGS += -fopenmp
OPENMP_FLAGS = -fopenmp
else
CFLAGS += -DDISABLE_OPENMP
OPENMP_FLAGS = -DDISABLE_OPENMP
endif
CFLAGS += $(OPENMP_FLAGS)

# for using GPUs
COMPUTE ?= 60 35
NVCC = nvcc
INCLUDES = -Iinclude -I$(DMLC_CORE)/include -I$(RABIT)/include
INCLUDES += -I$(CUB_PATH)
INCLUDES += -I$(GTEST_PATH)/include
CODE = $(foreach ver,$(COMPUTE),-gencode arch=compute_$(ver),code=sm_$(ver))
NVCC_FLAGS = --std=c++11 $(CODE) $(INCLUDES) -lineinfo --expt-extended-lambda
NVCC_FLAGS += -Xcompiler=$(OPENMP_FLAGS) -Xcompiler=-fPIC
ifeq ($(PLUGIN_UPDATER_GPU),ON)
CUDA_ROOT = $(shell dirname $(shell dirname $(shell which $(NVCC))))
INCLUDES += -I$(CUDA_ROOT)/include
LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcudart
endif

# specify tensor path
.PHONY: clean all lint clean_all doxygen rcpplint pypack Rpack Rbuild Rcheck java pylint
@@ -113,11 +129,21 @@ ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP)
CLI_OBJ = build/cli_main.o
include tests/cpp/xgboost_test.mk

# order of this rule matters wrt %.cc rule below!
build/%.o: src/%.cu
@mkdir -p $(@D)
$(NVCC) -c $(NVCC_FLAGS) $< -o $@

build/%.o: src/%.cc
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) -c $(CFLAGS) $< -o $@

# order of this rule matters wrt %.cc rule below!
build_plugin/%.o: plugin/%.cu
@mkdir -p $(@D)
$(NVCC) -c $(NVCC_FLAGS) $< -o $@

build_plugin/%.o: plugin/%.cc
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -MM -MT build_plugin/$*.o $< >build_plugin/$*.d
@@ -158,6 +184,8 @@ pylint:
flake8 --ignore E501 tests/python

test: $(ALL_TEST)
./plugin/updater_gpu/test/cpp/generate_data.sh
$(ALL_TEST)

check: test
./tests/cpp/xgboost_test
1  cub (submodule)
@@ -0,0 +1 @@
Subproject commit 89de7ab20167909bc2c4f8acd397671c47cf3c0d
@@ -63,7 +63,7 @@ class FeatureMap {
return names_[idx].c_str();
}
/*! \return type of specific feature */
const Type type(size_t idx) const {
Type type(size_t idx) const {
CHECK_LT(idx, names_.size()) << "FeatureMap feature index exceed bound";
return types_[idx];
}
@@ -54,10 +54,20 @@ TEST_COVER = 0

# path to gtest library (only used when $BUILD_TEST=1)
# there should be an include path in $GTEST_PATH/include and library in $GTEST_PATH/lib
GTEST_PATH =
GTEST_PATH ?=

# path to cub library (only used when $CUDA_ENABLED=1)
# this should point to the cub project root folder
CUB_PATH ?= cub

# List of additional plugins, checkout plugin folder.
# uncomment the following lines to include these plugins
# you can also add your own plugin like this
#
# XGB_PLUGINS += plugin/example/plugin.mk

# plugin to build tree on GPUs using CUDA
PLUGIN_UPDATER_GPU ?= OFF
ifeq ($(PLUGIN_UPDATER_GPU),ON)
XGB_PLUGINS += plugin/updater_gpu/plugin.mk
endif
@@ -49,42 +49,59 @@ Training time on 1000000 rows x 50 columns with 500 boosting iterations on
## Test
To run tests:
```bash
$ python -m nose test/
$ python -m nose test/python/
```
## Dependencies
A CUDA-capable GPU with compute capability 3.5 or higher (the algorithm depends on shuffle and vote instructions introduced in Kepler).

Building the plug-in requires CUDA Toolkit 7.5 or later.

The plugin also depends on CUB 1.6.4 - https://nvlabs.github.io/cub/

CUB is a header-only CUDA library which provides sort/reduce/scan primitives.

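For readers unfamiliar with CUB: its device-wide primitives follow a two-pass pattern where the first call with a NULL workspace only reports the temporary-storage size and the second call does the work. The sketch below is purely illustrative (it is not part of the plugin) and mirrors the pattern used by the plugin's `sumReduction` helper further down in this change; the array sizes and values are made up.

```cuda
// Illustrative sketch only: CUB's two-pass call pattern for a device-wide sum.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int n = 4;
  float h_in[n] = {1.f, 2.f, 3.f, 4.f};
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  // First call with a NULL workspace only queries the temp-storage size.
  void *d_temp = NULL;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);
  cudaMalloc(&d_temp, temp_bytes);
  // Second call performs the actual device-wide reduction.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

  float h_out = 0.f;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f\n", h_out);  // expected: 10.0

  cudaFree(d_temp); cudaFree(d_out); cudaFree(d_in);
  return 0;
}
```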
## Build
To use the plugin xgboost must be built using cmake specifying the option PLUGIN_UPDATER_GPU=ON. The location of the CUB library must also be specified with the cmake variable CUB_DIRECTORY. CMake will prepare a build system depending on which platform you are on.

From the command line on Windows or Linux starting from the xgboost directory:
### Using cmake
To use the plugin, xgboost must be built by specifying the option PLUGIN_UPDATER_GPU=ON. CMake will prepare a build system depending on which platform you are on.

On Linux, from the xgboost directory:
```bash
$ mkdir build
$ cd build
$ cmake .. -DPLUGIN_UPDATER_GPU=ON -DCUB_DIRECTORY=<MY_CUB_DIRECTORY>
$ cmake .. -DPLUGIN_UPDATER_GPU=ON
$ make
```
If 'make' fails, try invoking make again; there can sometimes be problems with the order in which items are built.

On Windows you may also need to specify your generator as 64 bit, so the cmake command becomes:
```bash
$ cmake .. -G"Visual Studio 12 2013 Win64" -DPLUGIN_UPDATER_GPU=ON -DCUB_DIRECTORY=<MY_CUB_DIRECTORY>
$ cmake .. -G"Visual Studio 12 2013 Win64" -DPLUGIN_UPDATER_GPU=ON
```
You may also be able to use a later version of Visual Studio, depending on whether the CUDA toolkit supports it.
cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode. This is also a good time to check it is being built as x64. If not, make sure the cmake generator is set correctly.

On Linux cmake will generate a Makefile in the build directory. Invoking the command 'make' from this directory will build the project. If the build fails, try invoking make again; there can sometimes be problems with the order in which items are built.
### Using make
The usual 'make' flow is now also supported for building the GPU-enabled tree construction plugins. It is currently tested only on Linux. From the xgboost directory:
```bash
# make sure the CUDA SDK bin directory is in the 'PATH' env variable
$ make PLUGIN_UPDATER_GPU=ON
```

On Windows cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode. This is also a good time to check it is being built as x64. If not, make sure the cmake generator is set correctly.
### For Developers!

The build process generates an xgboost library and executable as normal, but containing the GPU tree construction algorithm.
Some of the code inside the GPU plugin now has googletest unit-tests under 'tests/'.
They can be enabled and run along with the other unit-tests inside '<xgboostRoot>/tests/cpp' using:
```bash
# make sure the CUDA SDK bin directory is in the 'PATH' env variable
# the two commands below need only be executed once
$ source ./dmlc-core/scripts/travis/travis_setup_env.sh
$ make -f dmlc-core/scripts/packages.mk gtest
$ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
```

## Changelog
##### 2017/5/31
* Faster version of the grow_gpu plugin
* Added support for building the gpu plugin through the 'make' flow too

##### 2017/5/5
* Histogram performance improvements
* Fix gcc build issues

@@ -42,7 +42,7 @@ args = parser.parse_args()

if 'grow_gpu_hist' in args.algorithm:
run_benchmark(args, args.algorithm, 'grow_fast_histmaker')
if 'grow_gpu ' in args.algorithm:
if 'grow_gpu' in args.algorithm:
run_benchmark(args, args.algorithm, 'grow_colmaker')
if 'all' in args.algorithm:
run_benchmark(args, 'grow_gpu', 'grow_colmaker')
4  plugin/updater_gpu/plugin.mk (new file)
@@ -0,0 +1,4 @@

PLUGIN_OBJS += build_plugin/updater_gpu/src/updater_gpu.o \
build_plugin/updater_gpu/src/gpu_hist_builder.o
PLUGIN_LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcudart
@ -7,6 +7,11 @@
|
||||
#include "../../../src/tree/param.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "types.cuh"
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <cstdio>
|
||||
#include "cub/cub.cuh"
|
||||
#include "device_helpers.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@ -180,5 +185,95 @@ struct GpairCallbackOp {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Helper function to sort the pairs using cub's segmented RadixSortPairs
|
||||
* @param tmp_mem cub temporary memory info
|
||||
* @param keys keys double-buffer array
|
||||
* @param vals the values double-buffer array
|
||||
* @param nVals number of elements in the array
|
||||
* @param nSegs number of segments
|
||||
* @param offsets the segments
|
||||
*/
|
||||
template <typename T1, typename T2>
|
||||
void segmentedSort(dh::CubMemory &tmp_mem, dh::dvec2<T1> &keys, dh::dvec2<T2> &vals,
|
||||
int nVals, int nSegs, dh::dvec<int> &offsets, int start=0,
|
||||
int end=sizeof(T1)*8) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
NULL, tmpSize, keys.buff(), vals.buff(), nVals, nSegs,
|
||||
offsets.data(), offsets.data()+1, start, end));
|
||||
tmp_mem.LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
tmp_mem.d_temp_storage, tmpSize, keys.buff(), vals.buff(),
|
||||
nVals, nSegs, offsets.data(), offsets.data()+1, start, end));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function to perform device-wide sum-reduction
|
||||
* @param tmp_mem cub temporary memory info
|
||||
* @param in the input array to be reduced
|
||||
* @param out the output reduced value
|
||||
* @param nVals number of elements in the input array
|
||||
*/
|
||||
template <typename T>
|
||||
void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
|
||||
int nVals) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(),
|
||||
nVals));
|
||||
tmp_mem.LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
|
||||
in.data(), out.data(), nVals));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Fill a given constant value across all elements in the buffer
|
||||
* @param out the buffer to be filled
|
||||
* @param len number of elements in the buffer
|
||||
* @param def default value to be filled
|
||||
*/
|
||||
template <typename T, int BlkDim=256, int ItemsPerThread=4>
|
||||
void fillConst(T* out, int len, T def) {
|
||||
dh::launch_n<ItemsPerThread,BlkDim>(len, [=] __device__(int i) { out[i] = def; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief gather elements
|
||||
* @param out1 output gathered array for the first buffer
|
||||
* @param in1 first input buffer
|
||||
* @param out2 output gathered array for the second buffer
|
||||
* @param in2 second input buffer
|
||||
* @param instId gather indices
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T1, typename T2, int BlkDim=256, int ItemsPerThread=4>
|
||||
void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
|
||||
int nVals) {
|
||||
dh::launch_n<ItemsPerThread,BlkDim>
|
||||
(nVals, [=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
T1 v1 = in1[iid];
|
||||
T2 v2 = in2[iid];
|
||||
out1[i] = v1;
|
||||
out2[i] = v2;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief gather elements
|
||||
* @param out output gathered array
|
||||
* @param in input buffer
|
||||
* @param instId gather indices
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T, int BlkDim=256, int ItemsPerThread=4>
|
||||
void gather(T* out, const T* in, const int* instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread,BlkDim>
|
||||
(nVals, [=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
out[i] = in[iid];
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
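An aside for readers of this diff (not part of the change): the `segmentedSort` helper added above is a thin wrapper over `cub::DeviceSegmentedRadixSort::SortPairs`, driven through `cub::DoubleBuffer`s (which the `dvec2` class in the next file wraps). A minimal standalone sketch of that underlying call pattern, with illustrative sizes and values:

```cuda
// Illustrative only: segmented key/value sort with CUB double buffers.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int n = 6, nSegs = 2;
  float h_keys[n] = {3.f, 1.f, 2.f, 9.f, 7.f, 8.f};
  int   h_vals[n] = {0, 1, 2, 3, 4, 5};
  int   h_offsets[nSegs + 1] = {0, 3, 6};   // segment boundaries

  float *d_k1, *d_k2; int *d_v1, *d_v2, *d_off;
  cudaMalloc(&d_k1, n * sizeof(float)); cudaMalloc(&d_k2, n * sizeof(float));
  cudaMalloc(&d_v1, n * sizeof(int));   cudaMalloc(&d_v2, n * sizeof(int));
  cudaMalloc(&d_off, (nSegs + 1) * sizeof(int));
  cudaMemcpy(d_k1, h_keys, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_v1, h_vals, n * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_off, h_offsets, (nSegs + 1) * sizeof(int), cudaMemcpyHostToDevice);

  // CUB ping-pongs between the two allocations of each double buffer.
  cub::DoubleBuffer<float> keys(d_k1, d_k2);
  cub::DoubleBuffer<int>   vals(d_v1, d_v2);

  void *d_tmp = NULL; size_t tmp_bytes = 0;   // size query, then real sort
  cub::DeviceSegmentedRadixSort::SortPairs(d_tmp, tmp_bytes, keys, vals,
                                           n, nSegs, d_off, d_off + 1);
  cudaMalloc(&d_tmp, tmp_bytes);
  cub::DeviceSegmentedRadixSort::SortPairs(d_tmp, tmp_bytes, keys, vals,
                                           n, nSegs, d_off, d_off + 1);

  cudaMemcpy(h_keys, keys.Current(), n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%.0f ", h_keys[i]);  // 1 2 3 7 8 9
  printf("\n");
  return 0;
}
```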
@ -2,8 +2,6 @@
|
||||
* Copyright 2016 Rory Mitchell
|
||||
*/
|
||||
#pragma once
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/random.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
@ -14,6 +12,7 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
// Uncomment to enable
|
||||
// #define DEVICE_TIMER
|
||||
@ -21,6 +20,9 @@
|
||||
|
||||
namespace dh {
|
||||
|
||||
#define HOST_DEV_INLINE __host__ __device__ __forceinline__
|
||||
#define DEV_INLINE __device__ __forceinline__
|
||||
|
||||
/*
|
||||
* Error handling functions
|
||||
*/
|
||||
@ -145,7 +147,7 @@ struct Timer {
|
||||
int64_t elapsed() const { return (ClockT::now() - start).count(); }
|
||||
void printElapsed(std::string label) {
|
||||
safe_cuda(cudaDeviceSynchronize());
|
||||
printf("%s:\t %lld\n", label.c_str(), elapsed());
|
||||
printf("%s:\t %lld\n", label.c_str(), (long long)elapsed());
|
||||
reset();
|
||||
}
|
||||
};
|
||||
@ -228,10 +230,12 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
*/
|
||||
|
||||
class bulk_allocator;
|
||||
template <typename T> class dvec2;
|
||||
|
||||
template <typename T>
|
||||
class dvec {
|
||||
friend bulk_allocator;
|
||||
friend class dvec2<T>;
|
||||
|
||||
private:
|
||||
T *_ptr;
|
||||
@ -241,15 +245,17 @@ class dvec {
|
||||
if (!empty()) {
|
||||
throw std::runtime_error("Tried to allocate dvec but already allocated");
|
||||
}
|
||||
|
||||
_ptr = static_cast<T *>(ptr);
|
||||
_size = size;
|
||||
}
|
||||
|
||||
public:
|
||||
dvec() : _ptr(NULL), _size(0) {}
|
||||
|
||||
size_t size() const { return _size; }
|
||||
|
||||
bool empty() const { return _ptr == NULL || _size == 0; }
|
||||
|
||||
T *data() { return _ptr; }
|
||||
|
||||
std::vector<T> as_vector() const {
|
||||
@ -265,11 +271,9 @@ class dvec {
|
||||
|
||||
void print() {
|
||||
auto h_vector = this->as_vector();
|
||||
|
||||
for (auto e : h_vector) {
|
||||
std::cout << e << " ";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
@ -285,9 +289,7 @@ class dvec {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign vector to dvec, sizes are different");
|
||||
}
|
||||
|
||||
thrust::copy(other.begin(), other.end(), this->tbegin());
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -296,36 +298,74 @@ class dvec {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign dvec to dvec, sizes are different");
|
||||
}
|
||||
|
||||
thrust::copy(other.tbegin(), other.tend(), this->tbegin());
|
||||
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @class dvec2 device_helpers.cuh
|
||||
* @brief wrapper for storing 2 dvec's which are needed for cub::DoubleBuffer
|
||||
*/
|
||||
template <typename T>
|
||||
class dvec2 {
|
||||
friend bulk_allocator;
|
||||
|
||||
private:
|
||||
dvec<T> _d1, _d2;
|
||||
cub::DoubleBuffer<T> _buff;
|
||||
|
||||
void external_allocate(void *ptr1, void *ptr2, size_t size) {
|
||||
if (!empty()) {
|
||||
throw std::runtime_error("Tried to allocate dvec2 but already allocated");
|
||||
}
|
||||
_d1.external_allocate(ptr1, size);
|
||||
_d2.external_allocate(ptr2, size);
|
||||
_buff.d_buffers[0] = static_cast<T *>(ptr1);
|
||||
_buff.d_buffers[1] = static_cast<T *>(ptr2);
|
||||
_buff.selector = 0;
|
||||
}
|
||||
|
||||
public:
|
||||
dvec2() : _d1(), _d2(), _buff() {}
|
||||
|
||||
size_t size() const { return _d1.size(); }
|
||||
|
||||
bool empty() const { return _d1.empty() || _d2.empty(); }
|
||||
|
||||
cub::DoubleBuffer<T> &buff() { return _buff; }
|
||||
|
||||
dvec<T> &d1() { return _d1; }
|
||||
|
||||
dvec<T> &d2() { return _d2; }
|
||||
|
||||
T *current() { return _buff.Current(); }
|
||||
|
||||
dvec<T> &current_dvec() { return _buff.selector == 0? d1() : d2(); }
|
||||
|
||||
T *other() { return _buff.Alternate(); }
|
||||
};
|
||||
|
||||
class bulk_allocator {
|
||||
char *d_ptr;
|
||||
size_t _size;
|
||||
|
||||
const size_t align = 256;
|
||||
const int align = 256;
|
||||
|
||||
template <typename SizeT>
|
||||
size_t align_round_up(SizeT n) {
|
||||
if (n % align == 0) {
|
||||
return n;
|
||||
} else {
|
||||
return n + align - (n % align);
|
||||
}
|
||||
n = (n + align - 1) / align;
|
||||
return n * align;
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
|
||||
return align_round_up(first_size * sizeof(T));
|
||||
return align_round_up<SizeT>(first_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
|
||||
return align_round_up(first_size * sizeof(T)) + get_size_bytes(args...);
|
||||
return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
@ -336,11 +376,37 @@ class bulk_allocator {
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
|
||||
Args... args) {
|
||||
first_vec->external_allocate(static_cast<void *>(ptr), first_size);
|
||||
allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
|
||||
ptr += align_round_up(first_size * sizeof(T));
|
||||
allocate_dvec(ptr, args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
|
||||
return 2 * align_round_up(first_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
|
||||
return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size) {
|
||||
first_vec->external_allocate
|
||||
(static_cast<void *>(ptr),
|
||||
static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
|
||||
first_size);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size,
|
||||
Args... args) {
|
||||
allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
|
||||
ptr += (align_round_up(first_size * sizeof(T)) * 2);
|
||||
allocate_dvec(ptr, args...);
|
||||
}
|
||||
|
||||
public:
|
||||
bulk_allocator() : _size(0), d_ptr(NULL) {}
|
||||
|
||||
@ -357,11 +423,8 @@ class bulk_allocator {
|
||||
if (d_ptr != NULL) {
|
||||
throw std::runtime_error("Bulk allocator already allocated");
|
||||
}
|
||||
|
||||
_size = get_size_bytes(args...);
|
||||
|
||||
safe_cuda(cudaMalloc(&d_ptr, _size));
|
||||
|
||||
allocate_dvec(d_ptr, args...);
|
||||
}
|
||||
};
|
||||
@ -374,6 +437,7 @@ struct CubMemory {
|
||||
CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}
|
||||
|
||||
~CubMemory() { Free(); }
|
||||
|
||||
void Free() {
|
||||
if (d_temp_storage != NULL) {
|
||||
safe_cuda(cudaFree(d_temp_storage));
|
||||
@ -394,13 +458,13 @@ struct CubMemory {
|
||||
inline size_t available_memory() {
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
|
||||
safe_cuda(cudaMemGetInfo(&device_free, &device_total));
|
||||
return device_free;
|
||||
}
|
||||
|
||||
inline std::string device_name() {
|
||||
cudaDeviceProp prop;
|
||||
dh::safe_cuda(cudaGetDeviceProperties(&prop, 0));
|
||||
safe_cuda(cudaGetDeviceProperties(&prop, 0));
|
||||
return std::string(prop.name);
|
||||
}
|
||||
|
||||
@ -430,7 +494,6 @@ template <typename T>
|
||||
void print(char *label, const thrust::device_vector<T> &v,
|
||||
const char *format = "%d ", int max = 10) {
|
||||
thrust::host_vector<T> h_v = v;
|
||||
|
||||
std::cout << label << ":\n";
|
||||
for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
|
||||
printf(format, h_v[i]);
|
||||
@ -495,9 +558,21 @@ struct BernoulliRng {
|
||||
thrust::default_random_engine rng(seed);
|
||||
thrust::uniform_real_distribution<float> dist;
|
||||
rng.discard(i);
|
||||
|
||||
return dist(rng) <= p;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Helper macro to measure timing on GPU
|
||||
* @param call the GPU call
|
||||
* @param name name used to track later
|
||||
* @param stream cuda stream where to measure time
|
||||
*/
|
||||
#define TIMEIT(call, name) \
|
||||
do { \
|
||||
dh::Timer t1234; \
|
||||
call; \
|
||||
t1234.printElapsed(name); \
|
||||
} while(0)
|
||||
|
||||
} // namespace dh
|
||||
|
||||
192  plugin/updater_gpu/src/exact/argmax_by_key.cuh (new file)
@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "../common.cuh"
|
||||
#include "node.cuh"
|
||||
#include "loss_functions.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
/**
|
||||
* @enum ArgMaxByKeyAlgo argmax_by_key.cuh
|
||||
* @brief Help decide which algorithm to use for multi-argmax operation
|
||||
*/
|
||||
enum ArgMaxByKeyAlgo {
|
||||
/** simplest, use gmem-atomics for all updates */
|
||||
ABK_GMEM = 0,
|
||||
/** use smem-atomics for updates (when number of keys are less) */
|
||||
ABK_SMEM
|
||||
};
|
||||
|
||||
/** max depth until which to use shared mem based atomics for argmax */
|
||||
static const int MAX_ABK_LEVELS = 3;
|
||||
|
||||
HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
|
||||
Split out;
|
||||
if (a.score < b.score) {
|
||||
out.score = b.score;
|
||||
out.index = b.index;
|
||||
} else if (a.score == b.score) {
|
||||
out.score = a.score;
|
||||
out.index = (a.index < b.index)? a.index : b.index;
|
||||
} else {
|
||||
out.score = a.score;
|
||||
out.index = a.index;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
DEV_INLINE void atomicArgMax(Split* address, Split val) {
|
||||
unsigned long long* intAddress = (unsigned long long*) address;
|
||||
unsigned long long old = *intAddress;
|
||||
unsigned long long assumed;
|
||||
do {
|
||||
assumed = old;
|
||||
Split res = maxSplit(val, *(Split*)&assumed);
|
||||
old = atomicCAS(intAddress, assumed, *(unsigned long long*)&res);
|
||||
} while (assumed != old);
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals,
|
||||
const int* colIds,
|
||||
const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len,
|
||||
const TrainParam &param) {
|
||||
int nodeId = nodeAssigns[id];
|
||||
///@todo: this is really a bad check! but will be fixed when we move
|
||||
/// to key-based reduction
|
||||
if ((id == 0) || !((nodeId == nodeAssigns[id-1]) &&
|
||||
(colIds[id] == colIds[id-1]) &&
|
||||
(vals[id] == vals[id-1]))) {
|
||||
if (nodeId != UNUSED_NODE) {
|
||||
int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart,
|
||||
nUniqKeys);
|
||||
gpu_gpair colSum = gradSums[sumId];
|
||||
int uid = nodeId - nodeStart;
|
||||
Node<node_id_t> n = nodes[nodeId];
|
||||
gpu_gpair parentSum = n.gradSum;
|
||||
float parentGain = n.score;
|
||||
bool tmp;
|
||||
Split s;
|
||||
gpu_gpair missing = parentSum - colSum;
|
||||
s.score = loss_chg_missing(gradScans[id], missing, parentSum,
|
||||
parentGain, param, tmp);
|
||||
s.index = id;
|
||||
atomicArgMax(nodeSplits+uid, s);
|
||||
} // end if nodeId != UNUSED_NODE
|
||||
} // end if id == 0 ...
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len,
|
||||
const TrainParam param) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < len; id += stride) {
|
||||
argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
|
||||
nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len,
|
||||
const TrainParam param) {
|
||||
extern __shared__ char sArr[];
|
||||
Split* sNodeSplits = (Split*)sArr;
|
||||
int tid = threadIdx.x;
|
||||
Split defVal;
|
||||
#pragma unroll 1
|
||||
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
|
||||
sNodeSplits[i] = defVal;
|
||||
}
|
||||
__syncthreads();
|
||||
int id = tid + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < len; id += stride) {
|
||||
argMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
|
||||
nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
|
||||
}
|
||||
__syncthreads();
|
||||
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
|
||||
Split s = sNodeSplits[i];
|
||||
atomicArgMax(nodeSplits+i, s);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs argmax_by_key functionality but for cases when keys need not
|
||||
* occur contiguously
|
||||
* @param nodeSplits will contain information on best split for each node
|
||||
* @param gradScans exclusive sum on sorted segments for each col
|
||||
* @param gradSums gradient sum for each column in DMatrix based on to node-ids
|
||||
* @param vals feature values
|
||||
* @param colIds column index for each element in the feature values array
|
||||
* @param nodeAssigns node-id assignments to each element in DMatrix
|
||||
* @param nodes pointer to all nodes for this tree in BFS order
|
||||
* @param nUniqKeys number of unique node-ids in this level
|
||||
* @param nodeStart start index of the node-ids in this level
|
||||
* @param len number of elements
|
||||
* @param param training parameters
|
||||
* @param algo which algorithm to use for argmax_by_key
|
||||
*/
|
||||
template <typename node_id_t, int BLKDIM=256, int ITEMS_PER_THREAD=4>
|
||||
void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const TrainParam param,
|
||||
ArgMaxByKeyAlgo algo) {
|
||||
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
|
||||
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
|
||||
switch(algo) {
|
||||
case ABK_GMEM:
|
||||
atomicArgMaxByKeyGmem<node_id_t><<<nBlks,BLKDIM>>>
|
||||
(nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
case ABK_SMEM:
|
||||
atomicArgMaxByKeySmem<node_id_t>
|
||||
<<<nBlks,BLKDIM,sizeof(Split)*nUniqKeys>>>
|
||||
(nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("argMaxByKey: Bad algo passed!");
|
||||
};
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
200  plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh (new file)
@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "../common.cuh"
|
||||
#include "gradients.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
/**
|
||||
* @struct Pair fused_scan_reduce_by_key.cuh
|
||||
* @brief Pair used for key-based scan operations on gpu_gpair
|
||||
*/
|
||||
struct Pair {
|
||||
int key;
|
||||
gpu_gpair value;
|
||||
};
|
||||
|
||||
/** define a key that's not used at all in the entire boosting process */
|
||||
static const int NONE_KEY = -100;
|
||||
|
||||
/**
|
||||
* @brief Allocate temporary buffers needed for scan operations
|
||||
* @param tmpScans gradient buffer
|
||||
* @param tmpKeys keys buffer
|
||||
* @param size number of elements that will be scanned
|
||||
*/
|
||||
template <int BLKDIM_L1L3=256>
|
||||
int scanTempBufferSize(int size) {
|
||||
int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
|
||||
return nBlks;
|
||||
}
|
||||
|
||||
struct AddByKey {
|
||||
template <typename T>
|
||||
HOST_DEV_INLINE T operator()(const T &first, const T &second) const {
|
||||
T result;
|
||||
if (first.key == second.key) {
|
||||
result.key = first.key;
|
||||
result.value = first.value + second.value;
|
||||
} else {
|
||||
result.key = second.key;
|
||||
result.value = second.value;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename node_id_t, int BLKDIM_L1L3>
|
||||
__global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
|
||||
const int* instIds, gpu_gpair* mScans,
|
||||
int* mKeys, const node_id_t* keys, int nUniqKeys,
|
||||
const int* colIds, node_id_t nodeStart,
|
||||
const int size) {
|
||||
Pair rootPair = {NONE_KEY, gpu_gpair(0.f, 0.f)};
|
||||
int myKey;
|
||||
gpu_gpair myValue;
|
||||
typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
Pair threadData;
|
||||
int tid = blockIdx.x*BLKDIM_L1L3 + threadIdx.x;
|
||||
if (tid < size) {
|
||||
myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
myValue = get(tid, vals, instIds);
|
||||
} else {
|
||||
myKey = NONE_KEY;
|
||||
myValue = 0.f;
|
||||
}
|
||||
threadData.key = myKey;
|
||||
threadData.value = myValue;
|
||||
// get previous key, especially needed for the last thread in this block
|
||||
// in order to pass on the partial scan values.
|
||||
// this statement MUST appear before the checks below!
|
||||
// else, the result of this shuffle operation will be undefined
|
||||
int previousKey = __shfl_up(myKey, 1);
|
||||
// Collectively compute the block-wide exclusive prefix sum
|
||||
BlockScan(temp_storage).ExclusiveScan(threadData, threadData, rootPair,
|
||||
AddByKey());
|
||||
if (tid < size) {
|
||||
scans[tid] = threadData.value;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
if (threadIdx.x == BLKDIM_L1L3 - 1) {
|
||||
threadData.value = (myKey == previousKey)?
|
||||
threadData.value :
|
||||
gpu_gpair(0.0f, 0.0f);
|
||||
mKeys[blockIdx.x] = myKey;
|
||||
mScans[blockIdx.x] = threadData.value + myValue;
|
||||
}
|
||||
}
|
||||
|
||||
template <int BLKSIZE>
|
||||
__global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) {
|
||||
typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
|
||||
Pair threadData;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
for (int i = threadIdx.x; i < mLength; i += BLKSIZE-1) {
|
||||
threadData.key = mKeys[i];
|
||||
threadData.value = mScans[i];
|
||||
BlockScan(temp_storage).InclusiveScan(threadData, threadData,
|
||||
AddByKey());
|
||||
mScans[i] = threadData.value;
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename node_id_t, int BLKDIM_L1L3>
|
||||
__global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
|
||||
const gpu_gpair* vals, const int* instIds,
|
||||
const gpu_gpair* mScans, const int* mKeys,
|
||||
const node_id_t* keys, int nUniqKeys,
|
||||
const int* colIds, node_id_t nodeStart,
|
||||
const int size) {
|
||||
int relId = threadIdx.x;
|
||||
int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
|
||||
// to avoid the following warning from nvcc:
|
||||
// __shared__ memory variable with non-empty constructor or destructor
|
||||
// (potential race between threads)
|
||||
__shared__ char gradBuff[sizeof(gpu_gpair)];
|
||||
__shared__ int s_mKeys;
|
||||
gpu_gpair* s_mScans = (gpu_gpair*)gradBuff;
|
||||
if(tid >= size)
|
||||
return;
|
||||
// cache block-wide partial scan info
|
||||
if (relId == 0) {
|
||||
s_mKeys = (blockIdx.x > 0)? mKeys[blockIdx.x-1] : NONE_KEY;
|
||||
s_mScans[0] = (blockIdx.x > 0)? mScans[blockIdx.x-1] : gpu_gpair();
|
||||
}
|
||||
int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid-1, keys, colIds,
|
||||
nodeStart, nUniqKeys);
|
||||
gpu_gpair myValue = scans[tid];
|
||||
__syncthreads();
|
||||
if (blockIdx.x > 0 && s_mKeys == previousKey) {
|
||||
myValue += s_mScans[0];
|
||||
}
|
||||
if (tid == size - 1) {
|
||||
sums[previousKey] = myValue + get(tid, vals, instIds);
|
||||
}
|
||||
if ((previousKey != myKey) && (previousKey >= 0)) {
|
||||
sums[previousKey] = myValue;
|
||||
myValue = gpu_gpair(0.0f, 0.0f);
|
||||
}
|
||||
scans[tid] = myValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs fused reduce and scan by key functionality. It is assumed that
|
||||
* the keys occur contiguously!
|
||||
* @param sums the output gradient reductions for each element performed key-wise
|
||||
* @param scans the output gradient scans for each element performed key-wise
|
||||
* @param vals the gradients evaluated for each observation.
|
||||
* @param instIds instance ids for each element
|
||||
* @param keys keys to be used to segment the reductions. They need not occur
|
||||
* contiguously in contrast to scan_by_key. Currently, we need one key per
|
||||
* value in the 'vals' array.
|
||||
* @param size number of elements in the 'vals' array
|
||||
* @param nUniqKeys max number of uniq keys found per column
|
||||
* @param nCols number of columns
|
||||
* @param tmpScans temporary scan buffer needed for cub-pyramid algo
|
||||
* @param tmpKeys temporary key buffer needed for cub-pyramid algo
|
||||
* @param colIds column indices for each element in the array
|
||||
* @param nodeStart index of the leftmost node in the current level
|
||||
*/
|
||||
template <typename node_id_t, int BLKDIM_L1L3=256, int BLKDIM_L2=512>
|
||||
void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals,
|
||||
const int* instIds, const node_id_t* keys, int size,
|
||||
int nUniqKeys, int nCols, gpu_gpair* tmpScans,
|
||||
int* tmpKeys, const int* colIds, node_id_t nodeStart) {
|
||||
int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
|
||||
cudaMemset(sums, 0, nUniqKeys*nCols*sizeof(gpu_gpair));
|
||||
cubScanByKeyL1<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
|
||||
(scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
|
||||
nodeStart, size);
|
||||
cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
|
||||
cubScanByKeyL3<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
|
||||
(sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
|
||||
nodeStart, size);
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
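An aside for readers of this diff (not part of the change): within each contiguous run of equal keys, `reduceScanByKey` above writes an exclusive prefix sum into `scans` and the segment total into `sums`. A small CPU reference of that behavior, assuming plain floats in place of `gpu_gpair` and keys that are contiguous runs of 0..nUniqKeys-1:

```cpp
// CPU reference for the key-wise exclusive scan + per-key sum (illustrative).
#include <cstdio>
#include <vector>

int main() {
  std::vector<int>   keys = {0, 0, 0, 1, 1};
  std::vector<float> vals = {1.f, 2.f, 3.f, 4.f, 5.f};
  int nUniqKeys = 2;

  std::vector<float> scans(vals.size(), 0.f), sums(nUniqKeys, 0.f);
  float running = 0.f;
  for (size_t i = 0; i < vals.size(); ++i) {
    if (i > 0 && keys[i] != keys[i - 1]) running = 0.f;  // new key segment
    scans[i] = running;            // exclusive: prefix before adding vals[i]
    running += vals[i];
    sums[keys[i]] = running;       // ends up holding the segment total
  }

  // scans = 0 1 3 0 4, sums = 6 9
  for (float s : scans) printf("%g ", s);
  printf("| ");
  for (float s : sums) printf("%g ", s);
  printf("\n");
  return 0;
}
```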
391  plugin/updater_gpu/src/exact/gpu_builder.cuh (new file)
@ -0,0 +1,391 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "xgboost/tree_updater.h"
|
||||
#include "cub/cub.cuh"
|
||||
#include "../common.cuh"
|
||||
#include <vector>
|
||||
#include "loss_functions.cuh"
|
||||
#include "gradients.cuh"
|
||||
#include "node.cuh"
|
||||
#include "argmax_by_key.cuh"
|
||||
#include "split2node.cuh"
|
||||
#include "fused_scan_reduce_by_key.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void initRootNode(Node<node_id_t>* nodes, const gpu_gpair* sums,
|
||||
const TrainParam param) {
|
||||
// gradients already evaluated inside transferGrads
|
||||
Node<node_id_t> n;
|
||||
n.gradSum = sums[0];
|
||||
n.score = CalcGain(param, n.gradSum.g, n.gradSum.h);
|
||||
n.weight = CalcWeight(param, n.gradSum.g, n.gradSum.h);
|
||||
n.id = 0;
|
||||
nodes[0] = n;
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void assignColIds(int* colIds, const int* colOffsets) {
|
||||
int myId = blockIdx.x;
|
||||
int start = colOffsets[myId];
|
||||
int end = colOffsets[myId+1];
|
||||
for (int id = start+threadIdx.x; id < end; id += blockDim.x) {
|
||||
colIds[id] = myId;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
|
||||
const Node<node_id_t>* nodes, int nRows) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
if (id >= nRows) {
|
||||
return;
|
||||
}
|
||||
// if this element belongs to none of the currently active node-id's
|
||||
node_id_t nId = nodeIdsPerInst[id];
|
||||
if (nId == UNUSED_NODE) {
|
||||
return;
|
||||
}
|
||||
const Node<node_id_t> n = nodes[nId];
|
||||
node_id_t result;
|
||||
if (n.isLeaf() || n.isUnused()) {
|
||||
result = UNUSED_NODE;
|
||||
} else if(n.isDefaultLeft()) {
|
||||
result = (2 * n.id) + 1;
|
||||
} else {
|
||||
result = (2 * n.id) + 2;
|
||||
}
|
||||
nodeIdsPerInst[id] = result;
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
|
||||
const node_id_t* nodeIds, const int* instId,
|
||||
const Node<node_id_t>* nodes, const int* colOffsets,
|
||||
const float* vals, int nVals, int nCols) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < nVals; id += stride) {
|
||||
// fusing generation of indices for node locations
|
||||
nodeLocations[id] = id;
|
||||
// using nodeIds here since the previous kernel would have updated
|
||||
// the nodeIdsPerInst with all default assignments
|
||||
int nId = nodeIds[id];
|
||||
// if this element belongs to none of the currently active node-id's
|
||||
if (nId != UNUSED_NODE) {
|
||||
const Node<node_id_t> n = nodes[nId];
|
||||
int colId = n.colIdx;
|
||||
//printf("nid=%d colId=%d id=%d\n", nId, colId, id);
|
||||
int start = colOffsets[colId];
|
||||
int end = colOffsets[colId + 1];
|
||||
///@todo: too much wasteful threads!!
|
||||
if ((id >= start) && (id < end) && !(n.isLeaf() || n.isUnused())) {
|
||||
node_id_t result = (2 * n.id) + 1 + (vals[id] >= n.threshold);
|
||||
nodeIdsPerInst[instId[id]] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
|
||||
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if ((id < len) && !nodes[id].isUnused()) {
|
||||
int lid = (id << 1) + 1;
|
||||
int rid = (id << 1) + 2;
|
||||
if ((lid >= len) || (rid >= len)) {
|
||||
nodes[id].score = -FLT_MAX; // bottom-most nodes
|
||||
} else if (nodes[lid].isUnused() && nodes[rid].isUnused()) {
|
||||
nodes[id].score = -FLT_MAX; // unused child nodes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// unit test forward declaration for friend function access
|
||||
template <typename node_id_t> void testSmallData();
|
||||
template <typename node_id_t> void testLargeData();
|
||||
template <typename node_id_t> void testAllocate();
|
||||
template <typename node_id_t> void testMarkLeaves();
|
||||
template <typename node_id_t> void testDense2Sparse();
|
||||
template <typename node_id_t> class GPUBuilder;
|
||||
template <typename node_id_t>
|
||||
std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
|
||||
const std::string& file,
|
||||
xgboost::tree::exact::GPUBuilder<node_id_t>& builder);
|
||||
|
||||
template <typename node_id_t>
|
||||
class GPUBuilder {
|
||||
public:
|
||||
GPUBuilder(): allocated(false) {}
|
||||
|
||||
~GPUBuilder() {}
|
||||
|
||||
void Init(const TrainParam& p) {
|
||||
param = p;
|
||||
maxNodes = (1 << (param.max_depth + 1)) - 1;
|
||||
maxLeaves = 1 << param.max_depth;
|
||||
}
|
||||
|
||||
void UpdateParam(const TrainParam &param) { this->param = param; }
|
||||
|
||||
/// @note: Update should be only after Init!!
|
||||
void Update(const std::vector<bst_gpair>& gpair, DMatrix *hMat,
|
||||
RegTree* hTree) {
|
||||
if (!allocated) {
|
||||
setupOneTimeData(*hMat);
|
||||
}
|
||||
for (int i = 0; i < param.max_depth; ++i) {
|
||||
if (i == 0) {
|
||||
// make sure to start on a fresh tree with sorted values!
|
||||
vals.current_dvec() = vals_cached;
|
||||
instIds.current_dvec() = instIds_cached;
|
||||
transferGrads(gpair);
|
||||
}
|
||||
int nNodes = 1 << i;
|
||||
node_id_t nodeStart = nNodes - 1;
|
||||
initNodeData(i, nodeStart, nNodes);
|
||||
findSplit(i, nodeStart, nNodes);
|
||||
}
|
||||
// mark all the used nodes with unused children as leaf nodes
|
||||
markLeaves();
|
||||
dense2sparse(*hTree);
|
||||
}
|
||||
|
||||
private:
|
||||
friend void testSmallData<node_id_t>();
|
||||
friend void testLargeData<node_id_t>();
|
||||
friend void testAllocate<node_id_t>();
|
||||
friend void testMarkLeaves<node_id_t>();
|
||||
friend void testDense2Sparse<node_id_t>();
|
||||
friend std::shared_ptr<xgboost::DMatrix> setupGPUBuilder<node_id_t>(
|
||||
const std::string& file, GPUBuilder<node_id_t>& builder);
|
||||
|
||||
TrainParam param;
|
||||
/** whether we have initialized memory already (so as not to repeat!) */
|
||||
bool allocated;
|
||||
/** feature values stored in column-major compressed format */
|
||||
dh::dvec2<float> vals;
|
||||
dh::dvec<float> vals_cached;
|
||||
/** corresponding instance id's of these feature values */
|
||||
dh::dvec2<int> instIds;
|
||||
dh::dvec<int> instIds_cached;
|
||||
/** column offsets for these feature values */
|
||||
dh::dvec<int> colOffsets;
|
||||
dh::dvec<gpu_gpair> gradsInst;
|
||||
dh::dvec2<node_id_t> nodeAssigns;
|
||||
dh::dvec2<int> nodeLocations;
|
||||
dh::dvec<Node<node_id_t> > nodes;
|
||||
dh::dvec<node_id_t> nodeAssignsPerInst;
|
||||
dh::dvec<gpu_gpair> gradSums;
|
||||
dh::dvec<gpu_gpair> gradScans;
|
||||
dh::dvec<Split> nodeSplits;
|
||||
int nVals;
|
||||
int nRows;
|
||||
int nCols;
|
||||
int maxNodes;
|
||||
int maxLeaves;
|
||||
dh::CubMemory tmp_mem;
|
||||
dh::dvec<gpu_gpair> tmpScanGradBuff;
|
||||
dh::dvec<int> tmpScanKeyBuff;
|
||||
dh::dvec<int> colIds;
|
||||
dh::bulk_allocator ba;
|
||||
|
||||
void findSplit(int level, node_id_t nodeStart, int nNodes) {
|
||||
reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
|
||||
instIds.current(), nodeAssigns.current(), nVals, nNodes,
|
||||
nCols, tmpScanGradBuff.data(), tmpScanKeyBuff.data(),
|
||||
colIds.data(), nodeStart);
|
||||
argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
|
||||
vals.current(), colIds.data(), nodeAssigns.current(),
|
||||
nodes.data(), nNodes, nodeStart, nVals, param,
|
||||
level<=MAX_ABK_LEVELS? ABK_SMEM : ABK_GMEM);
|
||||
split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
|
||||
gradSums.data(), vals.current(), colIds.data(), colOffsets.data(),
|
||||
nodeAssigns.current(), nNodes, nodeStart, nCols, param);
|
||||
}
|
||||
|
||||
void allocateAllData(int offsetSize) {
|
||||
int tmpBuffSize = scanTempBufferSize(nVals);
|
||||
ba.allocate(&vals, nVals,
|
||||
&vals_cached, nVals,
|
||||
&instIds, nVals,
|
||||
&instIds_cached, nVals,
|
||||
&colOffsets, offsetSize,
|
||||
&gradsInst, nRows,
|
||||
&nodeAssigns, nVals,
|
||||
&nodeLocations, nVals,
|
||||
&nodes, maxNodes,
|
||||
&nodeAssignsPerInst, nRows,
|
||||
&gradSums, maxLeaves*nCols,
|
||||
&gradScans, nVals,
|
||||
&nodeSplits, maxLeaves,
|
||||
&tmpScanGradBuff, tmpBuffSize,
|
||||
&tmpScanKeyBuff, tmpBuffSize,
|
||||
&colIds, nVals);
|
||||
}
|
||||
|
||||
void setupOneTimeData(DMatrix& hMat) {
|
||||
size_t free_memory = dh::available_memory();
|
||||
if (!hMat.SingleColBlock()) {
|
||||
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
|
||||
}
|
||||
std::vector<float> fval;
|
||||
std::vector<int> fId, offset;
|
||||
convertToCsc(hMat, fval, fId, offset);
|
||||
allocateAllData((int)offset.size());
|
||||
transferAndSortData(fval, fId, offset);
|
||||
allocated = true;
|
||||
if (!param.silent) {
|
||||
const int mb_size = 1048576;
|
||||
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
||||
<< free_memory / mb_size << " MB on " << dh::device_name();
|
||||
}
|
||||
}
|
||||
|
||||
  void convertToCsc(DMatrix& hMat, std::vector<float>& fval,
                    std::vector<int>& fId, std::vector<int>& offset) {
    MetaInfo info = hMat.info();
    nRows = info.num_row;
    nCols = info.num_col;
    offset.reserve(nCols + 1);
    offset.push_back(0);
    fval.reserve(nCols * nRows);
    fId.reserve(nCols * nRows);
    // in case you end up with a DMatrix having no column access
    // then make sure to enable that before copying the data!
    if (!hMat.HaveColAccess()) {
      const std::vector<bool> enable(nCols, true);
      hMat.InitColAccess(enable, 1, nRows);
    }
    dmlc::DataIter<ColBatch>* iter = hMat.ColIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
      const ColBatch& batch = iter->Value();
      for (int i=0;i<batch.size;i++) {
        const ColBatch::Inst& col = batch[i];
        for (const ColBatch::Entry* it=col.data;it!=col.data+col.length;it++) {
          int inst_id = static_cast<int>(it->index);
          fval.push_back(it->fvalue);
          fId.push_back(inst_id);
        }
        offset.push_back(fval.size());
      }
    }
    nVals = fval.size();
  }

  void transferAndSortData(const std::vector<float>& fval,
                           const std::vector<int>& fId,
                           const std::vector<int>& offset) {
    vals.current_dvec() = fval;
    instIds.current_dvec() = fId;
    colOffsets = offset;
    segmentedSort<float,int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
    vals_cached = vals.current_dvec();
    instIds_cached = instIds.current_dvec();
    assignColIds<node_id_t><<<nCols,512>>>(colIds.data(), colOffsets.data());
  }

  void transferGrads(const std::vector<bst_gpair>& gpair) {
    // HACK
    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
                             sizeof(gpu_gpair)*nRows, cudaMemcpyHostToDevice));
    // evaluate the full-grad reduction for the root node
    sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
  }

  void initNodeData(int level, node_id_t nodeStart, int nNodes) {
    // all instances belong to root node at the beginning!
    if (level == 0) {
      nodes.fill(Node<node_id_t>());
      nodeAssigns.current_dvec().fill(0);
      nodeAssignsPerInst.fill(0);
      // for root node, just update the gradient/score/weight/id info
      // before splitting it! Currently all data is on GPU, hence this
      // stupid little kernel
      initRootNode<<<1,1>>>(nodes.data(), gradSums.data(), param);
    } else {
      const int BlkDim = 256;
      const int ItemsPerThread = 4;
      // assign default node ids first
      int nBlks = dh::div_round_up(nRows, BlkDim);
      fillDefaultNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
                                           nodes.data(), nRows);
      // evaluate the correct child indices of non-missing values next
      nBlks = dh::div_round_up(nVals, BlkDim*ItemsPerThread);
      assignNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
                                      nodeLocations.current(),
                                      nodeAssigns.current(),
                                      instIds.current(), nodes.data(),
                                      colOffsets.data(), vals.current(),
                                      nVals, nCols);
      // gather the node assignments across all other columns too
      gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
                        instIds.current(), nVals);
      sortKeys(level);
    }
  }

  void sortKeys(int level) {
    // segmented-sort the arrays based on node-id's
    // but we don't need more than level+1 bits for sorting!
    segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
                  0, level+1);
    gather<float,int>(vals.other(), vals.current(), instIds.other(),
                      instIds.current(), nodeLocations.current(), nVals);
    vals.buff().selector ^= 1;
    instIds.buff().selector ^= 1;
  }

  void markLeaves() {
    const int BlkDim = 128;
    int nBlks = dh::div_round_up(maxNodes, BlkDim);
    markLeavesKernel<<<nBlks,BlkDim>>>(nodes.data(), maxNodes);
  }

  void dense2sparse(RegTree &tree) {
    std::vector<Node<node_id_t> > hNodes = nodes.as_vector();
    int nodeId = 0;
    for (int i = 0; i < maxNodes; ++i) {
      const Node<node_id_t>& n = hNodes[i];
      if ((i != 0) && hNodes[i].isLeaf()) {
        tree[nodeId].set_leaf(n.weight * param.learning_rate);
        tree.stat(nodeId).sum_hess = n.gradSum.h;
        ++nodeId;
      } else if (!hNodes[i].isUnused()) {
        tree.AddChilds(nodeId);
        tree[nodeId].set_split(n.colIdx, n.threshold, n.dir==LeftDir);
        tree.stat(nodeId).loss_chg = n.score;
        tree.stat(nodeId).sum_hess = n.gradSum.h;
        tree.stat(nodeId).base_weight = n.weight;
        tree[tree[nodeId].cleft()].set_leaf(0);
        tree[tree[nodeId].cright()].set_leaf(0);
        ++nodeId;
      }
    }
  }
};

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
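For orientation, here is a minimal standalone sketch (hypothetical 2x2 input, not part of the patch) of the column-major CSC layout that convertToCsc fills into fval/fId/offset:

#include <cstdio>
#include <vector>

int main() {
  // Dense input [[10, 20], [30, 40]] (rows = instances, columns = features),
  // laid out column by column the way convertToCsc does.
  std::vector<float> fval   = {10.f, 30.f, 20.f, 40.f};  // feature values, column-major
  std::vector<int>   fId    = {0, 1, 0, 1};              // instance (row) id of each value
  std::vector<int>   offset = {0, 2, 4};                 // column j spans [offset[j], offset[j+1])
  for (int c = 0; c + 1 < static_cast<int>(offset.size()); ++c)
    for (int i = offset[c]; i < offset[c + 1]; ++i)
      std::printf("col=%d row=%d val=%.1f\n", c, fId[i], fval[i]);
  return 0;
}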
91  plugin/updater_gpu/src/exact/gradients.cuh  Normal file
@@ -0,0 +1,91 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../common.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @struct gpu_gpair gradients.cuh
 * @brief The first/second order gradients for iteratively building the tree
 */
struct gpu_gpair {
  /** the 'g_i' as it appears in the xgboost paper */
  float g;
  /** the 'h_i' as it appears in the xgboost paper */
  float h;

  HOST_DEV_INLINE gpu_gpair(): g(0.f), h(0.f) {}
  HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h): g(_g), h(_h) {}
  HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a): g(a.g), h(a.h) {}

  /**
   * @brief Checks whether the hessian is more than the defined weight
   * @param minWeight minimum weight to be compared against
   * @return true if the hessian is greater than the minWeight
   * @note this is useful in deciding whether to further split to child node
   */
  HOST_DEV_INLINE bool isSplittable(float minWeight) const {
    return (h > minWeight);
  }

  HOST_DEV_INLINE gpu_gpair& operator+=(const gpu_gpair& a) {
    g += a.g;
    h += a.h;
    return *this;
  }

  HOST_DEV_INLINE gpu_gpair& operator-=(const gpu_gpair& a) {
    g -= a.g;
    h -= a.h;
    return *this;
  }

  HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
                                             const gpu_gpair& b) {
    return gpu_gpair(a.g+b.g, a.h+b.h);
  }

  HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
                                             const gpu_gpair& b) {
    return gpu_gpair(a.g-b.g, a.h-b.h);
  }

  HOST_DEV_INLINE gpu_gpair(int value) {
    *this = gpu_gpair((float)value, (float)value);
  }
};


/**
 * @brief Gradient value getter function
 * @param id the index into the vals or instIds array from which to fetch
 * @param vals the gradient value buffer
 * @param instIds instance index buffer
 * @return the expected gradient value
 */
HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, const int* instIds) {
  id = instIds[id];
  return vals[id];
}

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
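As a small illustration of how these operators compose in split evaluation, here is a host-side sketch (a plain struct stands in for gpu_gpair, since HOST_DEV_INLINE needs the CUDA headers; all numbers are made up):

#include <cstdio>

// Stand-in for gpu_gpair on the host; same fields and semantics as above.
struct GPair { float g, h; };

GPair operator-(const GPair& a, const GPair& b) { return {a.g - b.g, a.h - b.h}; }

int main() {
  GPair parent{0.9f, 3.0f};   // gradient sum over the whole node
  GPair left{0.4f, 1.2f};     // prefix sum up to a split candidate
  GPair right = parent - left;
  // isSplittable(min_child_weight) only compares the hessian sum.
  float min_child_weight = 1.0f;
  std::printf("left splittable: %d, right splittable: %d\n",
              left.h > min_child_weight, right.h > min_child_weight);
  return 0;
}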
63  plugin/updater_gpu/src/exact/loss_functions.cuh  Normal file
@@ -0,0 +1,63 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../common.cuh"
#include "gradients.cuh"


namespace xgboost {
namespace tree {
namespace exact {

HOST_DEV_INLINE float device_calc_loss_chg(const TrainParam &param,
                                           const gpu_gpair &scan,
                                           const gpu_gpair &missing,
                                           const gpu_gpair &parent_sum,
                                           const float &parent_gain,
                                           bool missing_left) {
  gpu_gpair left = scan;
  if (missing_left) {
    left += missing;
  }
  gpu_gpair right = parent_sum - left;
  float left_gain = CalcGain(param, left.g, left.h);
  float right_gain = CalcGain(param, right.g, right.h);
  return left_gain + right_gain - parent_gain;
}

HOST_DEV_INLINE float loss_chg_missing(const gpu_gpair &scan,
                                       const gpu_gpair &missing,
                                       const gpu_gpair &parent_sum,
                                       const float &parent_gain,
                                       const TrainParam &param,
                                       bool &missing_left_out) {
  float missing_left_loss =
      device_calc_loss_chg(param, scan, missing, parent_sum, parent_gain, true);
  float missing_right_loss = device_calc_loss_chg(
      param, scan, missing, parent_sum, parent_gain, false);
  if (missing_left_loss >= missing_right_loss) {
    missing_left_out = true;
    return missing_left_loss;
  } else {
    missing_left_out = false;
    return missing_right_loss;
  }
}

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
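For reference, a standalone sketch of the decision the two functions above make, using a simplified gain formula (G*G/(H+lambda), ignoring L1 and max_delta_step) and made-up gradient sums; it is not the library's CalcGain, only an illustration of how the missing-value direction is chosen:

#include <cstdio>

// Simplified host-side gain, standing in for CalcGain.
static float Gain(float g, float h, float lambda) { return g * g / (h + lambda); }

int main() {
  const float lambda = 1.0f;
  // Hypothetical sums: scan = stats left of the candidate, missing = parent - column sum.
  float scan_g = 0.4f, scan_h = 1.2f;
  float miss_g = 0.1f, miss_h = 0.3f;
  float par_g = 0.9f, par_h = 3.0f;
  float parent_gain = Gain(par_g, par_h, lambda);
  // Try missing values on the left, then on the right; the larger change wins.
  for (int missing_left = 1; missing_left >= 0; --missing_left) {
    float lg = scan_g + (missing_left ? miss_g : 0.f);
    float lh = scan_h + (missing_left ? miss_h : 0.f);
    float loss_chg = Gain(lg, lh, lambda) + Gain(par_g - lg, par_h - lh, lambda) - parent_gain;
    std::printf("missing_left=%d loss_chg=%f\n", missing_left, loss_chg);
  }
  return 0;
}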
158  plugin/updater_gpu/src/exact/node.cuh  Normal file
@@ -0,0 +1,158 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "gradients.cuh"
#include "../common.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @enum DefaultDirection node.cuh
 * @brief Default direction to be followed in case of missing values
 */
enum DefaultDirection {
  /** move to left child */
  LeftDir = 0,
  /** move to right child */
  RightDir
};


/** used to assign default id to a Node */
static const int UNUSED_NODE = -1;


/**
 * @struct Split node.cuh
 * @brief Abstraction of a possible split in the decision tree
 */
struct Split {
  /** the optimal gain score for this node */
  float score;
  /** index where to split in the DMatrix */
  int index;

  HOST_DEV_INLINE Split(): score(-FLT_MAX), index(INT_MAX) {}

  /**
   * @brief Whether the split info is valid to be used to create a new child
   * @param minSplitLoss minimum score above which decision to split is made
   * @return true if splittable, else false
   */
  HOST_DEV_INLINE bool isSplittable(float minSplitLoss) const {
    return ((score >= minSplitLoss) && (index != INT_MAX));
  }
};


/**
 * @struct Node node.cuh
 * @brief Abstraction of a node in the decision tree
 */
template <typename node_id_t>
class Node {
 public:
  /** sum of gradients across all training samples part of this node */
  gpu_gpair gradSum;
  /** the optimal score for this node */
  float score;
  /** weight for this node */
  float weight;
  /** default direction for missing values */
  DefaultDirection dir;
  /** threshold value for comparison */
  float threshold;
  /** column (feature) index whose value needs to be compared in this node */
  int colIdx;
  /** node id (used as key for reduce/scan) */
  node_id_t id;

  HOST_DEV_INLINE Node(): gradSum(), score(-FLT_MAX), weight(-FLT_MAX),
                          dir(LeftDir), threshold(0.f), colIdx(UNUSED_NODE),
                          id(UNUSED_NODE) {}

  /** Tells whether this node is part of the decision tree */
  HOST_DEV_INLINE bool isUnused() const { return (id == UNUSED_NODE); }

  /** Tells whether this node is a leaf of the decision tree */
  HOST_DEV_INLINE bool isLeaf() const {
    return (!isUnused() && (score == -FLT_MAX));
  }

  /** Tells whether default direction is left child or not */
  HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
};


/**
 * @struct Segment node.cuh
 * @brief Space inefficient, but super easy to implement structure to define
 *  the start and end of a segment in the input array
 */
struct Segment {
  /** start index of the segment */
  int start;
  /** end index of the segment */
  int end;

  HOST_DEV_INLINE Segment(): start(-1), end(-1) {}

  /** Checks whether the current structure defines a valid segment */
  HOST_DEV_INLINE bool isValid() const {
    return !((start == -1) || (end == -1));
  }
};


/**
 * @enum NodeType node.cuh
 * @brief Useful to describe the node type in a dense BFS-order tree array
 */
enum NodeType {
  /** a non-leaf node */
  NODE = 0,
  /** leaf node */
  LEAF,
  /** unused node */
  UNUSED
};


/**
 * @brief Absolute BFS order IDs to col-wise unique IDs based on user input
 * @param tid the index of the element that this thread should access
 * @param abs the array of absolute IDs
 * @param colIds the array of column IDs for each element
 * @param nodeStart the start of the node ID at this level
 * @param nKeys number of nodes at this level
 * @return the uniq key
 */
template <typename node_id_t>
HOST_DEV_INLINE int abs2uniqKey(int tid, const node_id_t* abs,
                                const int* colIds, node_id_t nodeStart,
                                int nKeys) {
  int a = abs[tid];
  if (a == UNUSED_NODE) return a;
  return ((a - nodeStart) + (colIds[tid] * nKeys));
}

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
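To make the key mapping in abs2uniqKey concrete, here is a host-only sketch with made-up numbers (nodeStart, nKeys and the BFS ids are hypothetical, chosen only to show the arithmetic):

#include <cstdio>

// Host copy of the mapping in abs2uniqKey above.
int Abs2UniqKey(int absId, int colId, int nodeStart, int nKeys) {
  if (absId == -1) return -1;            // UNUSED_NODE passes through unchanged
  return (absId - nodeStart) + colId * nKeys;
}

int main() {
  const int nodeStart = 3;               // BFS ids 3..6 form level 2
  const int nKeys = 4;                   // 4 nodes on this level
  // Each (column, node) pair gets its own slot in a contiguous range of size nKeys.
  std::printf("%d\n", Abs2UniqKey(5, 0, nodeStart, nKeys));   // 2
  std::printf("%d\n", Abs2UniqKey(5, 2, nodeStart, nKeys));   // 10
  std::printf("%d\n", Abs2UniqKey(-1, 2, nodeStart, nKeys));  // -1 (unused)
  return 0;
}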
150  plugin/updater_gpu/src/exact/split2node.cuh  Normal file
@@ -0,0 +1,150 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../../../../src/tree/param.h"
#include "gradients.cuh"
#include "node.cuh"
#include "loss_functions.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @brief Helper function to update the child node based on the current status
 *  of its parent node
 * @param nodes the nodes array in which the position at 'nid' will be updated
 * @param nid the nodeId in the 'nodes' array corresponding to this child node
 * @param grad gradient sum for this child node
 * @param param the training parameter struct
 */
template <typename node_id_t>
DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
                                   const gpu_gpair& grad,
                                   const TrainParam &param) {
  nodes[nid].gradSum = grad;
  nodes[nid].score = CalcGain(param, grad.g, grad.h);
  nodes[nid].weight = CalcWeight(param, grad.g, grad.h);
  nodes[nid].id = nid;
}

/**
 * @brief Helper function to update the child nodes based on the current status
 *  of their parent node
 * @param nodes the nodes array in which the position at 'nid' will be updated
 * @param pid the nodeId of the parent
 * @param gradL gradient sum for the left child node
 * @param gradR gradient sum for the right child node
 * @param param the training parameter struct
 */
template <typename node_id_t>
DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
                                 const gpu_gpair& gradL, const gpu_gpair& gradR,
                                 const TrainParam &param) {
  int childId = (pid * 2) + 1;
  updateOneChildNode(nodes, childId, gradL, param);
  updateOneChildNode(nodes, childId+1, gradR, param);
}

template <typename node_id_t>
DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
                                      const Node<node_id_t>& n, int absNodeId, int colId,
                                      const gpu_gpair& gradScan,
                                      const gpu_gpair& colSum, float thresh,
                                      const TrainParam &param) {
  bool missingLeft = true;
  // get the default direction for the current node
  gpu_gpair missing = n.gradSum - colSum;
  loss_chg_missing(gradScan, missing, n.gradSum, n.score, param, missingLeft);
  // get the score/weight/id/gradSum for left and right child nodes
  gpu_gpair lGradSum, rGradSum;
  if (missingLeft) {
    lGradSum = gradScan + n.gradSum - colSum;
  } else {
    lGradSum = gradScan;
  }
  rGradSum = n.gradSum - lGradSum;
  updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
  // update default-dir, threshold and feature id for current node
  nodes[absNodeId].dir = missingLeft? LeftDir : RightDir;
  nodes[absNodeId].colIdx = colId;
  nodes[absNodeId].threshold = thresh;
}

template <typename node_id_t, int BLKDIM=256>
__global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits,
                                 const gpu_gpair* gradScans,
                                 const gpu_gpair* gradSums, const float* vals,
                                 const int* colIds, const int* colOffsets,
                                 const node_id_t* nodeAssigns, int nUniqKeys,
                                 node_id_t nodeStart, int nCols,
                                 const TrainParam param) {
  int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (uid >= nUniqKeys) {
    return;
  }
  int absNodeId = uid + nodeStart;
  Split s = nodeSplits[uid];
  if (s.isSplittable(param.min_split_loss)) {
    int idx = s.index;
    int nodeInstId = abs2uniqKey(idx, nodeAssigns, colIds, nodeStart,
                                 nUniqKeys);
    updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId,
                          colIds[idx], gradScans[idx],
                          gradSums[nodeInstId], vals[idx], param);
  } else {
    // cannot be split further, so this node is a leaf!
    nodes[absNodeId].score = -FLT_MAX;
  }
}

/**
 * @brief function to convert split information into node
 * @param nodes the output nodes
 * @param nodeSplits split information
 * @param gradScans scan of sorted gradients across columns
 * @param gradSums key-wise gradient reduction across columns
 * @param vals the feature values
 * @param colIds column indices for each element in the array
 * @param colOffsets column segment offsets
 * @param nodeAssigns node-id assignment to every feature value
 * @param nUniqKeys number of nodes that we are currently working on
 * @param nodeStart start offset of the nodes in the overall BFS tree
 * @param nCols number of columns
 * @param param the training parameter struct
 */
template <typename node_id_t, int BLKDIM=256>
void split2node(Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
                const gpu_gpair* gradSums, const float* vals, const int* colIds,
                const int* colOffsets, const node_id_t* nodeAssigns,
                int nUniqKeys, node_id_t nodeStart, int nCols,
                const TrainParam param) {
  int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
  split2nodeKernel<<<nBlks,BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
                                     vals, colIds, colOffsets, nodeAssigns,
                                     nUniqKeys, nodeStart, nCols,
                                     param);
}

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
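updateChildNodes relies on the dense BFS tree layout, where the children of node p sit at 2*p+1 and 2*p+2 and each level starts at (1&lt;&lt;level)-1. A tiny standalone sketch of that indexing convention (illustration only, not library code):

#include <cstdio>

int main() {
  // Children of node p in a dense BFS array.
  for (int pid = 0; pid < 3; ++pid)
    std::printf("node %d -> children %d, %d\n", pid, 2 * pid + 1, 2 * pid + 2);
  // First node id of each level; this is what nodeStart is set to per level.
  for (int level = 0; level < 4; ++level)
    std::printf("level %d starts at %d\n", level, (1 << level) - 1);
  return 0;
}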
@@ -1,110 +0,0 @@
/*!
|
||||
* Copyright 2016 Rory mitchell
|
||||
*/
|
||||
#pragma once
|
||||
#include <cub/cub.cuh>
|
||||
#include <xgboost/base.h>
|
||||
#include <vector>
|
||||
#include "device_helpers.cuh"
|
||||
#include "find_split_multiscan.cuh"
|
||||
#include "find_split_sorting.cuh"
|
||||
#include "gpu_data.cuh"
|
||||
#include "types.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
__global__ void
|
||||
reduce_split_candidates_kernel(Split *d_split_candidates, Node *d_current_nodes,
|
||||
Node *d_new_nodes, int n_current_nodes,
|
||||
int n_features, const GPUTrainingParam param) {
|
||||
int nid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (nid >= n_current_nodes) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Find best split for each node
|
||||
Split best;
|
||||
|
||||
for (int i = 0; i < n_features; i++) {
|
||||
best.Update(d_split_candidates[n_current_nodes * i + nid]);
|
||||
}
|
||||
|
||||
// Update current node
|
||||
d_current_nodes[nid].split = best;
|
||||
|
||||
// Generate new nodes
|
||||
d_new_nodes[nid * 2] =
|
||||
Node(best.left_sum,
|
||||
CalcGain(param, best.left_sum.grad(), best.left_sum.hess()),
|
||||
CalcWeight(param, best.left_sum.grad(), best.left_sum.hess()));
|
||||
d_new_nodes[nid * 2 + 1] =
|
||||
Node(best.right_sum,
|
||||
CalcGain(param, best.right_sum.grad(), best.right_sum.hess()),
|
||||
CalcWeight(param, best.right_sum.grad(), best.right_sum.hess()));
|
||||
}
|
||||
|
||||
void reduce_split_candidates(Split *d_split_candidates, Node *d_nodes,
|
||||
int level, int n_features,
|
||||
const GPUTrainingParam param) {
|
||||
// Current level nodes (before split)
|
||||
Node *d_current_nodes = d_nodes + (1 << (level)) - 1;
|
||||
// Next level nodes (after split)
|
||||
Node *d_new_nodes = d_nodes + (1 << (level + 1)) - 1;
|
||||
// Number of existing nodes on this level
|
||||
int n_current_nodes = 1 << level;
|
||||
|
||||
const int BLOCK_THREADS = 512;
|
||||
const int GRID_SIZE = dh::div_round_up(n_current_nodes, BLOCK_THREADS);
|
||||
|
||||
reduce_split_candidates_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||
d_split_candidates, d_current_nodes, d_new_nodes, n_current_nodes,
|
||||
n_features, param);
|
||||
dh::safe_cuda(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void colsample_level(GPUData *data, const TrainParam xgboost_param,
|
||||
const std::vector<int> &feature_set_tree,
|
||||
std::vector<int> *feature_set_level) {
|
||||
unsigned n_bytree =
|
||||
static_cast<unsigned>(xgboost_param.colsample_bytree * data->n_features);
|
||||
unsigned n =
|
||||
static_cast<unsigned>(n_bytree * xgboost_param.colsample_bylevel);
|
||||
CHECK_GT(n, 0);
|
||||
|
||||
*feature_set_level = feature_set_tree;
|
||||
|
||||
std::shuffle((*feature_set_level).begin(),
|
||||
(*feature_set_level).begin() + n_bytree, common::GlobalRandom());
|
||||
|
||||
data->feature_set = *feature_set_level;
|
||||
|
||||
data->feature_flags.fill(0);
|
||||
auto d_feature_set = data->feature_set.data();
|
||||
auto d_feature_flags = data->feature_flags.data();
|
||||
|
||||
dh::launch_n(
|
||||
n, [=] __device__(int i) { d_feature_flags[d_feature_set[i]] = 1; });
|
||||
}
|
||||
|
||||
void find_split(GPUData *data, const TrainParam xgboost_param, const int level,
|
||||
bool multiscan_algorithm,
|
||||
const std::vector<int> &feature_set_tree,
|
||||
std::vector<int> *feature_set_level) {
|
||||
colsample_level(data, xgboost_param, feature_set_tree, feature_set_level);
|
||||
// Reset split candidates
|
||||
data->split_candidates.fill(Split());
|
||||
|
||||
if (multiscan_algorithm) {
|
||||
find_split_candidates_multiscan(data, level);
|
||||
} else {
|
||||
find_split_candidates_sorted(data, level);
|
||||
}
|
||||
|
||||
// Find the best split for each node
|
||||
reduce_split_candidates(data->split_candidates.data(), data->nodes.data(),
|
||||
level, data->n_features, data->param);
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@@ -1,723 +0,0 @@
/*!
|
||||
* Copyright 2016 Rory mitchell
|
||||
*/
|
||||
#pragma once
|
||||
#include <cub/cub.cuh>
|
||||
#include <xgboost/base.h>
|
||||
#include "device_helpers.cuh"
|
||||
#include "gpu_data.cuh"
|
||||
#include "types.cuh"
|
||||
#include "common.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
typedef uint64_t BitFlagSet;
|
||||
|
||||
__device__ __inline__ void set_bit(BitFlagSet &bf, int index) { // NOLINT
|
||||
bf |= 1 << index;
|
||||
}
|
||||
|
||||
__device__ __inline__ bool check_bit(BitFlagSet bf, int index) {
|
||||
return (bf >> index) & 1;
|
||||
}
|
||||
|
||||
// Carryover prefix for scanning multiple tiles of bit flags
|
||||
struct FlagPrefixCallbackOp {
|
||||
BitFlagSet tile_carry;
|
||||
|
||||
__device__ FlagPrefixCallbackOp() : tile_carry(0) {}
|
||||
|
||||
__device__ BitFlagSet operator()(BitFlagSet block_aggregate) {
|
||||
BitFlagSet old_prefix = tile_carry;
|
||||
tile_carry |= block_aggregate;
|
||||
return old_prefix;
|
||||
}
|
||||
};
|
||||
|
||||
// Scan op for bit flags that resets if the final bit is set
|
||||
struct FlagScanOp {
|
||||
__device__ __forceinline__ BitFlagSet operator()(const BitFlagSet &a,
|
||||
const BitFlagSet &b) {
|
||||
if (check_bit(b, 63)) {
|
||||
return b;
|
||||
} else {
|
||||
return a | b;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <int _BLOCK_THREADS, int _N_NODES, bool _DEBUG_VALIDATE>
|
||||
struct FindSplitParamsMultiscan {
|
||||
enum {
|
||||
BLOCK_THREADS = _BLOCK_THREADS,
|
||||
TILE_ITEMS = BLOCK_THREADS,
|
||||
N_NODES = _N_NODES,
|
||||
N_WARPS = _BLOCK_THREADS / 32,
|
||||
DEBUG_VALIDATE = _DEBUG_VALIDATE,
|
||||
ITEMS_PER_THREAD = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <int _BLOCK_THREADS, int _N_NODES, bool _DEBUG_VALIDATE>
|
||||
struct ReduceParamsMultiscan {
|
||||
enum {
|
||||
BLOCK_THREADS = _BLOCK_THREADS,
|
||||
ITEMS_PER_THREAD = 1,
|
||||
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
|
||||
N_NODES = _N_NODES,
|
||||
N_WARPS = _BLOCK_THREADS / 32,
|
||||
DEBUG_VALIDATE = _DEBUG_VALIDATE
|
||||
};
|
||||
};
|
||||
|
||||
template <typename ParamsT> struct ReduceEnactorMultiscan {
|
||||
typedef cub::WarpReduce<gpu_gpair> WarpReduceT;
|
||||
|
||||
struct _TempStorage {
|
||||
typename WarpReduceT::TempStorage warp_reduce[ParamsT::N_WARPS];
|
||||
gpu_gpair partial_sums[ParamsT::N_NODES][ParamsT::N_WARPS];
|
||||
};
|
||||
|
||||
struct TempStorage : cub::Uninitialized<_TempStorage> {};
|
||||
|
||||
struct _Reduction {
|
||||
gpu_gpair node_sums[ParamsT::N_NODES];
|
||||
};
|
||||
|
||||
struct Reduction : cub::Uninitialized<_Reduction> {};
|
||||
|
||||
// Thread local member variables
|
||||
const ItemIter item_iter;
|
||||
_TempStorage &temp_storage;
|
||||
_Reduction &reduction;
|
||||
gpu_gpair gpair;
|
||||
NodeIdT node_id;
|
||||
NodeIdT node_id_adjusted;
|
||||
const int node_begin;
|
||||
|
||||
__device__ __forceinline__
|
||||
ReduceEnactorMultiscan(TempStorage &temp_storage, // NOLINT
|
||||
Reduction &reduction, // NOLINT
|
||||
const ItemIter item_iter, const int node_begin)
|
||||
: temp_storage(temp_storage.Alias()), reduction(reduction.Alias()),
|
||||
item_iter(item_iter), node_begin(node_begin) {}
|
||||
|
||||
__device__ __forceinline__ void ResetPartials() {
|
||||
if (threadIdx.x < ParamsT::N_WARPS) {
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
temp_storage.partial_sums[NODE][threadIdx.x] = gpu_gpair();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ResetReduction() {
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
reduction.node_sums[threadIdx.x] = gpu_gpair();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void LoadTile(const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
if (threadIdx.x < num_remaining) {
|
||||
bst_uint i = offset + threadIdx.x;
|
||||
gpair = thrust::get<0>(item_iter[i]);
|
||||
// gpair = d_items[offset + threadIdx.x].gpair;
|
||||
// node_id = d_node_id[offset + threadIdx.x];
|
||||
node_id = thrust::get<2>(item_iter[i]);
|
||||
node_id_adjusted = node_id - node_begin;
|
||||
} else {
|
||||
gpair = gpu_gpair();
|
||||
node_id = -1;
|
||||
node_id_adjusted = -1;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ProcessTile(const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
LoadTile(offset, num_remaining);
|
||||
|
||||
// Warp synchronous reduction
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
bool active = node_id_adjusted == NODE;
|
||||
|
||||
unsigned int ballot = __ballot(active);
|
||||
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int lane_id = threadIdx.x % 32;
|
||||
|
||||
if (ballot == 0) {
|
||||
continue;
|
||||
} else if (__popc(ballot) == 1) {
|
||||
if (active) {
|
||||
temp_storage.partial_sums[NODE][warp_id] += gpair;
|
||||
}
|
||||
} else {
|
||||
gpu_gpair sum = WarpReduceT(temp_storage.warp_reduce[warp_id])
|
||||
.Sum(active ? gpair : gpu_gpair());
|
||||
if (lane_id == 0) {
|
||||
temp_storage.partial_sums[NODE][warp_id] += sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ReducePartials() {
|
||||
// Use single warp to reduce partials
|
||||
if (threadIdx.x < 32) {
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
gpu_gpair sum =
|
||||
WarpReduceT(temp_storage.warp_reduce[0])
|
||||
.Sum(threadIdx.x < ParamsT::N_WARPS
|
||||
? temp_storage.partial_sums[NODE][threadIdx.x]
|
||||
: gpu_gpair());
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
reduction.node_sums[NODE] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
|
||||
const bst_uint &segment_end) {
|
||||
// Current position
|
||||
bst_uint offset = segment_begin;
|
||||
|
||||
ResetReduction();
|
||||
ResetPartials();
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Process full tiles
|
||||
while (offset < segment_end) {
|
||||
ProcessTile(offset, segment_end - offset);
|
||||
offset += ParamsT::TILE_ITEMS;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
ReducePartials();
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ParamsT, typename ReductionT>
|
||||
struct FindSplitEnactorMultiscan {
|
||||
typedef cub::BlockScan<BitFlagSet, ParamsT::BLOCK_THREADS> FlagsBlockScanT;
|
||||
|
||||
typedef cub::WarpReduce<Split> WarpSplitReduceT;
|
||||
|
||||
typedef cub::WarpReduce<float> WarpReduceT;
|
||||
|
||||
typedef cub::WarpScan<gpu_gpair> WarpScanT;
|
||||
|
||||
struct _TempStorage {
|
||||
union {
|
||||
typename WarpSplitReduceT::TempStorage warp_split_reduce;
|
||||
typename FlagsBlockScanT::TempStorage flags_scan;
|
||||
typename WarpScanT::TempStorage warp_gpair_scan[ParamsT::N_WARPS];
|
||||
typename WarpReduceT::TempStorage warp_reduce[ParamsT::N_WARPS];
|
||||
};
|
||||
|
||||
Split warp_best_splits[ParamsT::N_NODES][ParamsT::N_WARPS];
|
||||
gpu_gpair partial_sums[ParamsT::N_NODES][ParamsT::N_WARPS];
|
||||
gpu_gpair top_level_sum[ParamsT::N_NODES]; // Sum of current partial sums
|
||||
gpu_gpair tile_carry[ParamsT::N_NODES]; // Contains top-level sums from
|
||||
// previous tiles
|
||||
Split best_splits[ParamsT::N_NODES];
|
||||
// Cache current level nodes into shared memory
|
||||
float node_root_gain[ParamsT::N_NODES];
|
||||
gpu_gpair node_parent_sum[ParamsT::N_NODES];
|
||||
};
|
||||
|
||||
struct TempStorage : cub::Uninitialized<_TempStorage> {};
|
||||
|
||||
// Thread local member variables
|
||||
const ItemIter item_iter;
|
||||
Split *d_split_candidates_out;
|
||||
const Node *d_nodes;
|
||||
_TempStorage &temp_storage;
|
||||
gpu_gpair gpair;
|
||||
float fvalue;
|
||||
NodeIdT node_id;
|
||||
NodeIdT node_id_adjusted;
|
||||
const NodeIdT node_begin;
|
||||
const GPUTrainingParam ¶m;
|
||||
const ReductionT &reduction;
|
||||
const int level;
|
||||
FlagPrefixCallbackOp flag_prefix_op;
|
||||
|
||||
__device__ __forceinline__ FindSplitEnactorMultiscan(
|
||||
TempStorage &temp_storage, const ItemIter item_iter, // NOLINT
|
||||
Split *d_split_candidates_out, const Node *d_nodes,
|
||||
const NodeIdT node_begin, const GPUTrainingParam ¶m,
|
||||
const ReductionT reduction, const int level)
|
||||
: temp_storage(temp_storage.Alias()), item_iter(item_iter),
|
||||
d_split_candidates_out(d_split_candidates_out), d_nodes(d_nodes),
|
||||
node_begin(node_begin), param(param), reduction(reduction),
|
||||
level(level), flag_prefix_op() {}
|
||||
|
||||
__device__ __forceinline__ void UpdateTileCarry() {
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
temp_storage.tile_carry[threadIdx.x] +=
|
||||
temp_storage.top_level_sum[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ResetTileCarry() {
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
temp_storage.tile_carry[threadIdx.x] = gpu_gpair();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ResetPartials() {
|
||||
if (threadIdx.x < ParamsT::N_WARPS) {
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
temp_storage.partial_sums[NODE][threadIdx.x] = gpu_gpair();
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
temp_storage.top_level_sum[threadIdx.x] = gpu_gpair();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ResetSplits() {
|
||||
if (threadIdx.x < ParamsT::N_WARPS) {
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
temp_storage.warp_best_splits[NODE][threadIdx.x] = Split();
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
temp_storage.best_splits[threadIdx.x] = Split();
|
||||
}
|
||||
}
|
||||
|
||||
// Cache d_nodes array for this level into shared memory
|
||||
__device__ __forceinline__ void CacheNodes() {
|
||||
// Get pointer to nodes on the current level
|
||||
const Node *d_nodes_level = d_nodes + node_begin;
|
||||
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
temp_storage.node_root_gain[threadIdx.x] =
|
||||
d_nodes_level[threadIdx.x].root_gain;
|
||||
temp_storage.node_parent_sum[threadIdx.x] =
|
||||
d_nodes_level[threadIdx.x].sum_gradients;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void LoadTile(bst_uint offset,
|
||||
bst_uint num_remaining) {
|
||||
if (threadIdx.x < num_remaining) {
|
||||
bst_uint i = offset + threadIdx.x;
|
||||
gpair = thrust::get<0>(item_iter[i]);
|
||||
fvalue = thrust::get<1>(item_iter[i]);
|
||||
node_id = thrust::get<2>(item_iter[i]);
|
||||
node_id_adjusted = node_id - node_begin;
|
||||
} else {
|
||||
node_id = -1;
|
||||
node_id_adjusted = -1;
|
||||
fvalue = -FLT_MAX;
|
||||
gpair = gpu_gpair();
|
||||
}
|
||||
}
|
||||
|
||||
// Is this node being processed by current kernel iteration?
|
||||
__device__ __forceinline__ bool NodeActive() {
|
||||
return node_id_adjusted < ParamsT::N_NODES && node_id_adjusted >= 0;
|
||||
}
|
||||
|
||||
// Is current fvalue different from left fvalue
|
||||
__device__ __forceinline__ bool
|
||||
LeftMostFvalue(const bst_uint &segment_begin, const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
int left_index = offset + threadIdx.x - 1;
|
||||
float left_fvalue = left_index >= static_cast<int>(segment_begin) &&
|
||||
threadIdx.x < num_remaining
|
||||
? thrust::get<1>(item_iter[left_index])
|
||||
: -FLT_MAX;
|
||||
|
||||
return left_fvalue != fvalue;
|
||||
}
|
||||
|
||||
// Prevent splitting in the middle of same valued instances
|
||||
__device__ __forceinline__ bool
|
||||
CheckSplitValid(const bst_uint &segment_begin, const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
BitFlagSet bit_flag = 0;
|
||||
|
||||
bool valid_split = false;
|
||||
|
||||
if (LeftMostFvalue(segment_begin, offset, num_remaining)) {
|
||||
valid_split = true;
|
||||
// Use MSB bit to flag if fvalue is leftmost
|
||||
set_bit(bit_flag, 63);
|
||||
}
|
||||
|
||||
// Flag nodeid
|
||||
if (NodeActive()) {
|
||||
set_bit(bit_flag, node_id_adjusted);
|
||||
}
|
||||
|
||||
FlagsBlockScanT(temp_storage.flags_scan)
|
||||
.ExclusiveScan(bit_flag, bit_flag, FlagScanOp(), flag_prefix_op);
|
||||
__syncthreads();
|
||||
|
||||
if (!valid_split && NodeActive()) {
|
||||
if (!check_bit(bit_flag, node_id_adjusted)) {
|
||||
valid_split = true;
|
||||
}
|
||||
}
|
||||
|
||||
return valid_split;
|
||||
}
|
||||
|
||||
// Perform warp reduction to find if this lane contains best loss_chg in warp
|
||||
__device__ __forceinline__ bool QueryLaneBestLoss(const float &loss_chg) {
|
||||
int lane_id = threadIdx.x % 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
|
||||
// Possible source of bugs. Not all threads in warp are active here. Not
|
||||
// sure if reduce function will behave correctly
|
||||
float best = WarpReduceT(temp_storage.warp_reduce[warp_id])
|
||||
.Reduce(loss_chg, cub::Max());
|
||||
|
||||
// Its possible for more than one lane to contain the best value, so make
|
||||
// sure only one lane returns true
|
||||
unsigned int ballot = __ballot(loss_chg == best);
|
||||
|
||||
if (lane_id == (__ffs(ballot) - 1)) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Which thread in this warp should update the current best split, if any
|
||||
// Returns true for one thread or none
|
||||
__device__ __forceinline__ bool
|
||||
QueryUpdateWarpSplit(const float &loss_chg,
|
||||
volatile const float &warp_best_loss) {
|
||||
bool update = false;
|
||||
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
bool active = node_id_adjusted == NODE;
|
||||
|
||||
unsigned int ballot = __ballot(loss_chg > warp_best_loss && active);
|
||||
|
||||
// No lane has improved loss_chg
|
||||
if (__popc(ballot) == 0) {
|
||||
continue;
|
||||
} else if (__popc(ballot) == 1) {
|
||||
// A single lane has improved loss_chg, set true for this lane
|
||||
int lane_id = threadIdx.x % 32;
|
||||
|
||||
if (lane_id == __ffs(ballot) - 1) {
|
||||
update = true;
|
||||
}
|
||||
} else {
|
||||
// More than one lane has improved loss_chg, perform a reduction.
|
||||
if (QueryLaneBestLoss(active ? loss_chg : -FLT_MAX)) {
|
||||
update = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return update;
|
||||
}
|
||||
|
||||
__device__ void PrintTileScan(int block_id, bool thread_active,
|
||||
float loss_chg, gpu_gpair missing) {
|
||||
if (blockIdx.x != block_id) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int warp = 0; warp < ParamsT::N_WARPS; warp++) {
|
||||
if (threadIdx.x / 32 == warp) {
|
||||
for (int lane = 0; lane < 32; lane++) {
|
||||
gpu_gpair g = cub::ShuffleIndex(gpair, lane);
|
||||
gpu_gpair missing_broadcast = cub::ShuffleIndex(missing, lane);
|
||||
float fvalue_broadcast = __shfl(fvalue, lane);
|
||||
bool thread_active_broadcast = __shfl(thread_active, lane);
|
||||
float loss_chg_broadcast = __shfl(loss_chg, lane);
|
||||
NodeIdT node_id_broadcast = cub::ShuffleIndex(node_id, lane);
|
||||
if (threadIdx.x == 32 * warp) {
|
||||
printf("tid %d, nid %d, fvalue %1.2f, active %c, loss %1.2f, scan ",
|
||||
threadIdx.x + lane, node_id_broadcast, fvalue_broadcast,
|
||||
thread_active_broadcast ? 'y' : 'n',
|
||||
loss_chg_broadcast < 0.0f ? 0 : loss_chg_broadcast);
|
||||
g.print();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void
|
||||
EvaluateSplits(const bst_uint &segment_begin, const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
bool valid_split = CheckSplitValid(segment_begin, offset, num_remaining);
|
||||
|
||||
bool thread_active =
|
||||
NodeActive() && valid_split && threadIdx.x < num_remaining;
|
||||
|
||||
const int warp_id = threadIdx.x / 32;
|
||||
|
||||
gpu_gpair parent_sum = thread_active
|
||||
? temp_storage.node_parent_sum[node_id_adjusted]
|
||||
: gpu_gpair();
|
||||
float parent_gain =
|
||||
thread_active ? temp_storage.node_root_gain[node_id_adjusted] : 0.0f;
|
||||
gpu_gpair missing = thread_active
|
||||
? parent_sum - reduction.node_sums[node_id_adjusted]
|
||||
: gpu_gpair();
|
||||
|
||||
bool missing_left;
|
||||
|
||||
float loss_chg = thread_active
|
||||
? loss_chg_missing(gpair, missing, parent_sum,
|
||||
parent_gain, param, missing_left)
|
||||
: -FLT_MAX;
|
||||
|
||||
// PrintTileScan(64, thread_active, loss_chg, missing);
|
||||
|
||||
float warp_best_loss =
|
||||
thread_active
|
||||
? temp_storage.warp_best_splits[node_id_adjusted][warp_id].loss_chg
|
||||
: 0.0f;
|
||||
|
||||
if (QueryUpdateWarpSplit(loss_chg, warp_best_loss)) {
|
||||
float fvalue_split = fvalue - FVALUE_EPS;
|
||||
|
||||
if (missing_left) {
|
||||
gpu_gpair left_sum = missing + gpair;
|
||||
gpu_gpair right_sum = parent_sum - left_sum;
|
||||
temp_storage.warp_best_splits[node_id_adjusted][warp_id].Update(
|
||||
loss_chg, missing_left, fvalue_split, blockIdx.x, left_sum,
|
||||
right_sum, param);
|
||||
} else {
|
||||
gpu_gpair left_sum = gpair;
|
||||
gpu_gpair right_sum = parent_sum - left_sum;
|
||||
temp_storage.warp_best_splits[node_id_adjusted][warp_id].Update(
|
||||
loss_chg, missing_left, fvalue_split, blockIdx.x, left_sum,
|
||||
right_sum, param);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void BlockExclusiveScan() {
|
||||
ResetPartials();
|
||||
|
||||
__syncthreads();
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int lane_id = threadIdx.x % 32;
|
||||
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
bool node_active = node_id_adjusted == NODE;
|
||||
|
||||
unsigned int ballot = __ballot(node_active);
|
||||
|
||||
gpu_gpair warp_sum = gpu_gpair();
|
||||
gpu_gpair scan_result = gpu_gpair();
|
||||
|
||||
if (ballot > 0) {
|
||||
WarpScanT(temp_storage.warp_gpair_scan[warp_id])
|
||||
.InclusiveScan(node_active ? gpair : gpu_gpair(), scan_result,
|
||||
cub::Sum(), warp_sum);
|
||||
}
|
||||
|
||||
if (node_active) {
|
||||
gpair = scan_result - gpair;
|
||||
}
|
||||
|
||||
if (lane_id == 0) {
|
||||
temp_storage.partial_sums[NODE][warp_id] = warp_sum;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 32) {
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
gpu_gpair top_level_sum;
|
||||
bool warp_active = threadIdx.x < ParamsT::N_WARPS;
|
||||
gpu_gpair scan_result;
|
||||
WarpScanT(temp_storage.warp_gpair_scan[warp_id])
|
||||
.InclusiveScan(warp_active
|
||||
? temp_storage.partial_sums[NODE][threadIdx.x]
|
||||
: gpu_gpair(),
|
||||
scan_result, cub::Sum(), top_level_sum);
|
||||
|
||||
if (warp_active) {
|
||||
temp_storage.partial_sums[NODE][threadIdx.x] =
|
||||
scan_result - temp_storage.partial_sums[NODE][threadIdx.x];
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
temp_storage.top_level_sum[NODE] = top_level_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (NodeActive()) {
|
||||
gpair += temp_storage.partial_sums[node_id_adjusted][warp_id] +
|
||||
temp_storage.tile_carry[node_id_adjusted];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
UpdateTileCarry();
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ProcessTile(const bst_uint &segment_begin,
|
||||
const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
LoadTile(offset, num_remaining);
|
||||
BlockExclusiveScan();
|
||||
EvaluateSplits(segment_begin, offset, num_remaining);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ReduceSplits() {
|
||||
for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
|
||||
if (threadIdx.x < 32) {
|
||||
Split s = Split();
|
||||
if (threadIdx.x < ParamsT::N_WARPS) {
|
||||
s = temp_storage.warp_best_splits[NODE][threadIdx.x];
|
||||
}
|
||||
Split best = WarpSplitReduceT(temp_storage.warp_split_reduce)
|
||||
.Reduce(s, split_reduce_op());
|
||||
if (threadIdx.x == 0) {
|
||||
temp_storage.best_splits[NODE] = best;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void WriteBestSplits() {
|
||||
const int nodes_level = 1 << level;
|
||||
|
||||
if (threadIdx.x < ParamsT::N_NODES) {
|
||||
d_split_candidates_out[blockIdx.x * nodes_level + threadIdx.x] =
|
||||
temp_storage.best_splits[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
|
||||
const bst_uint &segment_end) {
|
||||
// Current position
|
||||
bst_uint offset = segment_begin;
|
||||
|
||||
ResetTileCarry();
|
||||
ResetSplits();
|
||||
CacheNodes();
|
||||
__syncthreads();
|
||||
|
||||
// Process full tiles
|
||||
while (offset < segment_end) {
|
||||
ProcessTile(segment_begin, offset, segment_end - offset);
|
||||
__syncthreads();
|
||||
offset += ParamsT::TILE_ITEMS;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
ReduceSplits();
|
||||
|
||||
__syncthreads();
|
||||
WriteBestSplits();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FindSplitParamsT, typename ReduceParamsT>
|
||||
__global__ void
|
||||
#if __CUDA_ARCH__ <= 530
|
||||
__launch_bounds__(1024, 2)
|
||||
#endif
|
||||
find_split_candidates_multiscan_kernel(
|
||||
const ItemIter items_iter, Split *d_split_candidates_out,
|
||||
const Node *d_nodes, const int node_begin, bst_uint num_items,
|
||||
int num_features, const int *d_feature_offsets,
|
||||
const GPUTrainingParam param, const int *d_feature_flags,
|
||||
const int level) {
|
||||
if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int segment_begin = d_feature_offsets[blockIdx.x];
|
||||
int segment_end = d_feature_offsets[blockIdx.x + 1];
|
||||
|
||||
typedef ReduceEnactorMultiscan<ReduceParamsT> ReduceT;
|
||||
typedef FindSplitEnactorMultiscan<FindSplitParamsT,
|
||||
typename ReduceT::_Reduction>
|
||||
FindSplitT;
|
||||
|
||||
__shared__ union {
|
||||
typename ReduceT::TempStorage reduce;
|
||||
typename FindSplitT::TempStorage find_split;
|
||||
} temp_storage;
|
||||
|
||||
__shared__ typename ReduceT::Reduction reduction;
|
||||
|
||||
ReduceT(temp_storage.reduce, reduction, items_iter, node_begin)
|
||||
.ProcessRegion(segment_begin, segment_end);
|
||||
__syncthreads();
|
||||
|
||||
FindSplitT find_split(temp_storage.find_split, items_iter,
|
||||
d_split_candidates_out, d_nodes, node_begin, param,
|
||||
reduction.Alias(), level);
|
||||
find_split.ProcessRegion(segment_begin, segment_end);
|
||||
}
|
||||
|
||||
template <int N_NODES>
|
||||
void find_split_candidates_multiscan_variation(GPUData *data, const int level) {
|
||||
const int node_begin = (1 << level) - 1;
|
||||
const int BLOCK_THREADS = 512;
|
||||
|
||||
CHECK(BLOCK_THREADS / 32 < 32)
|
||||
<< "Too many active warps. See FindSplitEnactor - ReduceSplits.";
|
||||
|
||||
typedef FindSplitParamsMultiscan<BLOCK_THREADS, N_NODES, false>
|
||||
find_split_params;
|
||||
typedef ReduceParamsMultiscan<BLOCK_THREADS, N_NODES, false> reduce_params;
|
||||
int grid_size = data->n_features;
|
||||
|
||||
find_split_candidates_multiscan_kernel<
|
||||
find_split_params,
|
||||
reduce_params><<<grid_size, find_split_params::BLOCK_THREADS>>>(
|
||||
data->items_iter, data->split_candidates.data(), data->nodes.data(),
|
||||
node_begin, data->fvalues.size(), data->n_features, data->foffsets.data(),
|
||||
data->param, data->feature_flags.data(), level);
|
||||
|
||||
dh::safe_cuda(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void find_split_candidates_multiscan(GPUData *data, const int level) {
|
||||
// Select templated variation of split finding algorithm
|
||||
switch (level) {
|
||||
case 0:
|
||||
find_split_candidates_multiscan_variation<1>(data, level);
|
||||
break;
|
||||
case 1:
|
||||
find_split_candidates_multiscan_variation<2>(data, level);
|
||||
break;
|
||||
case 2:
|
||||
find_split_candidates_multiscan_variation<4>(data, level);
|
||||
break;
|
||||
case 3:
|
||||
find_split_candidates_multiscan_variation<8>(data, level);
|
||||
break;
|
||||
case 4:
|
||||
find_split_candidates_multiscan_variation<16>(data, level);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@@ -1,414 +0,0 @@
/*!
|
||||
* Copyright 2016 Rory mitchell
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/base.h>
|
||||
#include <cub/cub.cuh>
|
||||
#include "common.cuh"
|
||||
#include "device_helpers.cuh"
|
||||
#include "types.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
struct ScanTuple {
|
||||
gpu_gpair gpair;
|
||||
NodeIdT node_id;
|
||||
|
||||
__device__ ScanTuple() {}
|
||||
|
||||
__device__ ScanTuple(gpu_gpair gpair, NodeIdT node_id)
|
||||
: gpair(gpair), node_id(node_id) {}
|
||||
|
||||
__device__ ScanTuple operator+=(const ScanTuple &rhs) {
|
||||
if (node_id != rhs.node_id) {
|
||||
*this = rhs;
|
||||
return *this;
|
||||
} else {
|
||||
gpair += rhs.gpair;
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
__device__ ScanTuple operator+(const ScanTuple &rhs) const {
|
||||
ScanTuple t = *this;
|
||||
return t += rhs;
|
||||
}
|
||||
};
|
||||
|
||||
struct GpairTupleCallbackOp {
|
||||
// Running prefix
|
||||
ScanTuple running_total;
|
||||
// Constructor
|
||||
__device__ GpairTupleCallbackOp()
|
||||
: running_total(ScanTuple(gpu_gpair(), -1)) {}
|
||||
__device__ ScanTuple operator()(ScanTuple block_aggregate) {
|
||||
ScanTuple old_prefix = running_total;
|
||||
running_total += block_aggregate;
|
||||
return old_prefix;
|
||||
}
|
||||
};
|
||||
|
||||
template <int BLOCK_THREADS>
|
||||
struct ReduceEnactorSorting {
|
||||
typedef cub::BlockScan<ScanTuple, BLOCK_THREADS> GpairScanT;
|
||||
struct _TempStorage {
|
||||
typename GpairScanT::TempStorage gpair_scan;
|
||||
};
|
||||
|
||||
struct TempStorage : cub::Uninitialized<_TempStorage> {};
|
||||
|
||||
// Thread local member variables
|
||||
gpu_gpair *d_block_node_sums;
|
||||
int *d_block_node_offsets;
|
||||
const ItemIter item_iter;
|
||||
_TempStorage &temp_storage;
|
||||
gpu_gpair gpair;
|
||||
NodeIdT node_id;
|
||||
NodeIdT right_node_id;
|
||||
// Contains node_id relative to the current level only
|
||||
NodeIdT node_id_adjusted;
|
||||
GpairTupleCallbackOp callback_op;
|
||||
const int level;
|
||||
|
||||
__device__ __forceinline__
|
||||
ReduceEnactorSorting(TempStorage &temp_storage, // NOLINT
|
||||
gpu_gpair *d_block_node_sums, int *d_block_node_offsets,
|
||||
ItemIter item_iter, const int level)
|
||||
: temp_storage(temp_storage.Alias()),
|
||||
d_block_node_sums(d_block_node_sums),
|
||||
d_block_node_offsets(d_block_node_offsets),
|
||||
item_iter(item_iter),
|
||||
callback_op(),
|
||||
level(level) {}
|
||||
|
||||
__device__ __forceinline__ void LoadTile(const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
if (threadIdx.x < num_remaining) {
|
||||
bst_uint i = offset + threadIdx.x;
|
||||
gpair = thrust::get<0>(item_iter[i]);
|
||||
node_id = thrust::get<2>(item_iter[i]);
|
||||
right_node_id = threadIdx.x == num_remaining - 1
|
||||
? -1
|
||||
: thrust::get<2>(item_iter[i + 1]);
|
||||
// Prevent overflow
|
||||
const int level_begin = (1 << level) - 1;
|
||||
node_id_adjusted =
|
||||
max(static_cast<int>(node_id) - level_begin, -1); // NOLINT
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ProcessTile(const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
LoadTile(offset, num_remaining);
|
||||
|
||||
ScanTuple t(gpair, node_id);
|
||||
GpairScanT(temp_storage.gpair_scan).InclusiveSum(t, t, callback_op);
|
||||
__syncthreads();
|
||||
|
||||
// If tail of segment
|
||||
if (node_id != right_node_id && node_id_adjusted >= 0 &&
|
||||
threadIdx.x < num_remaining) {
|
||||
// Write sum
|
||||
d_block_node_sums[node_id_adjusted] = t.gpair;
|
||||
// Write offset
|
||||
d_block_node_offsets[node_id_adjusted] = offset + threadIdx.x + 1;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
|
||||
const bst_uint &segment_end) {
|
||||
const int max_nodes = 1 << level;
|
||||
dh::block_fill(d_block_node_offsets, max_nodes, -1);
|
||||
dh::block_fill(d_block_node_sums, max_nodes, gpu_gpair());
|
||||
|
||||
// Current position
|
||||
bst_uint offset = segment_begin;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Process full tiles
|
||||
while (offset < segment_end) {
|
||||
ProcessTile(offset, segment_end - offset);
|
||||
offset += BLOCK_THREADS;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <int BLOCK_THREADS, int N_WARPS = BLOCK_THREADS / 32>
|
||||
struct FindSplitEnactorSorting {
|
||||
typedef cub::BlockScan<gpu_gpair, BLOCK_THREADS> GpairScanT;
|
||||
typedef cub::BlockReduce<Split, BLOCK_THREADS> SplitReduceT;
|
||||
typedef cub::WarpReduce<float> WarpLossReduceT;
|
||||
|
||||
struct _TempStorage {
|
||||
union {
|
||||
typename GpairScanT::TempStorage gpair_scan;
|
||||
typename SplitReduceT::TempStorage split_reduce;
|
||||
typename WarpLossReduceT::TempStorage loss_reduce[N_WARPS];
|
||||
};
|
||||
Split warp_best_splits[N_WARPS];
|
||||
};
|
||||
|
||||
struct TempStorage : cub::Uninitialized<_TempStorage> {};
|
||||
|
||||
// Thread local member variables
|
||||
_TempStorage &temp_storage;
|
||||
gpu_gpair *d_block_node_sums;
|
||||
int *d_block_node_offsets;
|
||||
const ItemIter item_iter;
|
||||
const Node *d_nodes;
|
||||
gpu_gpair gpair;
|
||||
float fvalue;
|
||||
NodeIdT node_id;
|
||||
float left_fvalue;
|
||||
const GPUTrainingParam ¶m;
|
||||
Split *d_split_candidates_out;
|
||||
const int level;
|
||||
|
||||
__device__ __forceinline__ FindSplitEnactorSorting(
|
||||
TempStorage &temp_storage, gpu_gpair *d_block_node_sums, // NOLINT
|
||||
int *d_block_node_offsets, const ItemIter item_iter, const Node *d_nodes,
|
||||
const GPUTrainingParam ¶m, Split *d_split_candidates_out,
|
||||
const int level)
|
||||
: temp_storage(temp_storage.Alias()),
|
||||
d_block_node_sums(d_block_node_sums),
|
||||
d_block_node_offsets(d_block_node_offsets),
|
||||
item_iter(item_iter),
|
||||
d_nodes(d_nodes),
|
||||
d_split_candidates_out(d_split_candidates_out),
|
||||
level(level),
|
||||
param(param) {}
|
||||
|
||||
__device__ __forceinline__ void LoadTile(NodeIdT node_id_adjusted,
|
||||
const bst_uint &node_begin,
|
||||
const bst_uint &offset,
|
||||
const bst_uint &num_remaining) {
|
||||
if (threadIdx.x < num_remaining) {
|
||||
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      fvalue = thrust::get<1>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      bool first_item = offset + threadIdx.x == node_begin;
      left_fvalue =
          first_item ? fvalue - FVALUE_EPS : thrust::get<1>(item_iter[i - 1]);
    }
  }

  __device__ void PrintTileScan(int block_id, bool thread_active,
                                float loss_chg, gpu_gpair missing) {
    if (blockIdx.x != block_id) {
      return;
    }

    for (int warp = 0; warp < N_WARPS; warp++) {
      if (threadIdx.x / 32 == warp) {
        for (int lane = 0; lane < 32; lane++) {
          gpu_gpair g = cub::ShuffleIndex(gpair, lane);
          gpu_gpair missing_broadcast = cub::ShuffleIndex(missing, lane);
          float fvalue_broadcast = __shfl(fvalue, lane);
          bool thread_active_broadcast = __shfl(thread_active, lane);
          float loss_chg_broadcast = __shfl(loss_chg, lane);
          if (threadIdx.x == 32 * warp) {
            printf("tid %d, fvalue %1.2f, active %c, loss %1.2f, scan ",
                   threadIdx.x + lane, fvalue_broadcast,
                   thread_active_broadcast ? 'y' : 'n',
                   loss_chg_broadcast < 0.0f ? 0 : loss_chg_broadcast);
            g.print();
          }
        }
      }

      __syncthreads();
    }
  }

  __device__ __forceinline__ bool QueryUpdateWarpSplit(float loss_chg,
                                                       float warp_best_loss,
                                                       bool thread_active) {
    int warp_id = threadIdx.x / 32;
    int ballot = __ballot(loss_chg > warp_best_loss && thread_active);
    if (ballot == 0) {
      return false;
    } else {
      // Warp reduce best loss
      float best = WarpLossReduceT(temp_storage.loss_reduce[warp_id])
                       .Reduce(loss_chg, cub::Max());
      // Broadcast
      best = cub::ShuffleIndex(best, 0);

      if (loss_chg == best) {
        return true;
      }
    }

    return false;
  }

  __device__ __forceinline__ bool LeftmostFvalue() {
    return fvalue != left_fvalue;
  }

  __device__ __forceinline__ void EvaluateSplits(
      const NodeIdT &node_id_adjusted, const bst_uint &node_begin,
      const bst_uint &offset, const bst_uint &num_remaining) {
    bool thread_active = LeftmostFvalue() && threadIdx.x < num_remaining &&
                         node_id_adjusted >= 0 && node_id >= 0;

    Node n = thread_active ? d_nodes[node_id] : Node();
    gpu_gpair missing =
        thread_active ? n.sum_gradients - d_block_node_sums[node_id_adjusted]
                      : gpu_gpair();

    bool missing_left;
    float loss_chg = thread_active
                         ? loss_chg_missing(gpair, missing, n.sum_gradients,
                                            n.root_gain, param, missing_left)
                         : -FLT_MAX;

    int warp_id = threadIdx.x / 32;
    volatile float warp_best_loss =
        temp_storage.warp_best_splits[warp_id].loss_chg;

    if (QueryUpdateWarpSplit(loss_chg, warp_best_loss, thread_active)) {
      float fvalue_split = (fvalue + left_fvalue) / 2.0f;

      gpu_gpair left_sum = gpair;
      if (missing_left) {
        left_sum += missing;
      }
      gpu_gpair right_sum = n.sum_gradients - left_sum;
      temp_storage.warp_best_splits[warp_id].Update(loss_chg, missing_left,
                                                    fvalue_split, blockIdx.x,
                                                    left_sum, right_sum, param);
    }
  }

  __device__ __forceinline__ void ProcessTile(
      const NodeIdT &node_id_adjusted, const bst_uint &node_begin,
      const bst_uint &offset, const bst_uint &num_remaining,
      GpairCallbackOp &callback_op) {  // NOLINT
    LoadTile(node_id_adjusted, node_begin, offset, num_remaining);

    // Scan gpair
    const bool thread_active = threadIdx.x < num_remaining && node_id >= 0;
    GpairScanT(temp_storage.gpair_scan)
        .ExclusiveSum(thread_active ? gpair : gpu_gpair(), gpair, callback_op);
    __syncthreads();
    // Evaluate split
    EvaluateSplits(node_id_adjusted, node_begin, offset, num_remaining);
  }

  __device__ __forceinline__ void WriteBestSplit(
      const NodeIdT &node_id_adjusted) {
    if (threadIdx.x < 32) {
      bool active = threadIdx.x < N_WARPS;
      float warp_loss =
          active ? temp_storage.warp_best_splits[threadIdx.x].loss_chg
                 : -FLT_MAX;
      if (QueryUpdateWarpSplit(warp_loss, 0, active)) {
        const int max_nodes = 1 << level;
        d_split_candidates_out[blockIdx.x * max_nodes + node_id_adjusted] =
            temp_storage.warp_best_splits[threadIdx.x];
      }
    }
  }

  __device__ __forceinline__ void ProcessNode(const NodeIdT &node_id_adjusted,
                                              const bst_uint &node_begin,
                                              const bst_uint &node_end) {
    dh::block_fill(temp_storage.warp_best_splits, N_WARPS, Split());

    GpairCallbackOp callback_op = GpairCallbackOp();

    bst_uint offset = node_begin;

    while (offset < node_end) {
      ProcessTile(node_id_adjusted, node_begin, offset, node_end - offset,
                  callback_op);
      offset += BLOCK_THREADS;
      __syncthreads();
    }

    WriteBestSplit(node_id_adjusted);
  }

  __device__ __forceinline__ void ProcessFeature(const bst_uint &segment_begin,
                                                 const bst_uint &segment_end) {
    int node_begin = segment_begin;

    const int max_nodes = 1 << level;

    // Iterate through nodes
    int active_nodes = 0;
    for (int i = 0; i < max_nodes; i++) {
      int node_end = d_block_node_offsets[i];

      if (node_end == -1) {
        continue;
      }

      active_nodes++;

      ProcessNode(i, node_begin, node_end);

      __syncthreads();

      node_begin = node_end;
    }
  }
};

template <int BLOCK_THREADS>
__global__ __launch_bounds__(1024, 1) void find_split_candidates_sorted_kernel(
    const ItemIter items_iter, Split *d_split_candidates_out,
    const Node *d_nodes, bst_uint num_items, const int num_features,
    const int *d_feature_offsets, gpu_gpair *d_node_sums, int *d_node_offsets,
    const GPUTrainingParam param, const int *d_feature_flags, const int level) {
  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
    return;
  }

  bst_uint segment_begin = d_feature_offsets[blockIdx.x];
  bst_uint segment_end = d_feature_offsets[blockIdx.x + 1];

  typedef ReduceEnactorSorting<BLOCK_THREADS> ReduceT;
  typedef FindSplitEnactorSorting<BLOCK_THREADS> FindSplitT;

  __shared__ union {
    typename ReduceT::TempStorage reduce;
    typename FindSplitT::TempStorage find_split;
  } temp_storage;

  const int max_modes_level = 1 << level;
  gpu_gpair *d_block_node_sums = d_node_sums + blockIdx.x * max_modes_level;
  int *d_block_node_offsets = d_node_offsets + blockIdx.x * max_modes_level;

  ReduceT(temp_storage.reduce, d_block_node_sums, d_block_node_offsets,
          items_iter, level)
      .ProcessRegion(segment_begin, segment_end);
  __syncthreads();

  FindSplitT(temp_storage.find_split, d_block_node_sums, d_block_node_offsets,
             items_iter, d_nodes, param, d_split_candidates_out, level)
      .ProcessFeature(segment_begin, segment_end);
}

void find_split_candidates_sorted(GPUData *data, const int level) {
  const int BLOCK_THREADS = 512;

  CHECK(BLOCK_THREADS / 32 < 32) << "Too many active warps.";

  int grid_size = data->n_features;

  find_split_candidates_sorted_kernel<
      BLOCK_THREADS><<<grid_size, BLOCK_THREADS>>>(
      data->items_iter, data->split_candidates.data(), data->nodes.data(),
      data->fvalues.size(), data->n_features, data->foffsets.data(),
      data->node_sums.data(), data->node_offsets.data(), data->param,
      data->feature_flags.data(), level);

  dh::safe_cuda(cudaGetLastError());
  dh::safe_cuda(cudaDeviceSynchronize());
}
}  // namespace tree
}  // namespace xgboost
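Aside, not part of the diff: the split evaluation above amounts to comparing the gain of the two prospective children against the parent's gain, with the "missing" gradient bucket tried on one side. A minimal host-side sketch, assuming the gpu_gpair, CalcGain and TrainParam definitions used elsewhere in this plugin; split_gain_sketch is a hypothetical helper name used only for illustration.

// Illustrative sketch only -- not code from this commit.
// left_scan is the exclusive prefix sum of gradient pairs left of the candidate,
// parent_sum / parent_gain describe the node being split (n.sum_gradients /
// n.root_gain in the kernel), and `missing` is parent_sum minus the column total.
inline float split_gain_sketch(const TrainParam &param,
                               const gpu_gpair &parent_sum, float parent_gain,
                               gpu_gpair left_scan, const gpu_gpair &missing,
                               bool missing_left) {
  gpu_gpair left_sum = left_scan;
  if (missing_left) {
    left_sum += missing;  // try routing default (missing) values to the left
  }
  gpu_gpair right_sum = parent_sum - left_sum;
  float children_gain = CalcGain(param, left_sum.grad(), left_sum.hess()) +
                        CalcGain(param, right_sum.grad(), right_sum.hess());
  return children_gain - parent_gain;  // corresponds to loss_chg in the kernel
}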
@ -1,230 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#include "gpu_builder.cuh"
#include <cub/cub.cuh>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>
#include "../../../src/common/random.h"
#include "common.cuh"
#include "device_helpers.cuh"
#include "find_split.cuh"
#include "gpu_data.cuh"
#include "types.cuh"

namespace xgboost {
namespace tree {
GPUBuilder::GPUBuilder() { gpu_data = new GPUData(); }

void GPUBuilder::Init(const TrainParam& param_in) {
  param = param_in;
  CHECK(param.max_depth < 16) << "Tree depth too large.";

  dh::safe_cuda(cudaSetDevice(param.gpu_id));
  if (!param.silent) {
    LOG(CONSOLE) << "Device: [" << param.gpu_id << "] " << dh::device_name();
  }
}

GPUBuilder::~GPUBuilder() { delete gpu_data; }

void GPUBuilder::UpdateNodeId(int level) {
  auto* d_node_id_instance = gpu_data->node_id_instance.data();
  Node* d_nodes = gpu_data->nodes.data();

  dh::launch_n(gpu_data->node_id_instance.size(), [=] __device__(int i) {
    NodeIdT item_node_id = d_node_id_instance[i];

    if (item_node_id < 0) {
      return;
    }

    Node node = d_nodes[item_node_id];

    if (node.IsLeaf()) {
      d_node_id_instance[i] = -1;
    } else if (node.split.missing_left) {
      d_node_id_instance[i] = item_node_id * 2 + 1;
    } else {
      d_node_id_instance[i] = item_node_id * 2 + 2;
    }
  });

  dh::safe_cuda(cudaDeviceSynchronize());

  auto* d_fvalues = gpu_data->fvalues.data();
  auto* d_instance_id = gpu_data->instance_id.data();
  auto* d_node_id = gpu_data->node_id.data();
  auto* d_feature_id = gpu_data->feature_id.data();

  // Update node based on fvalue where exists
  dh::launch_n(gpu_data->fvalues.size(), [=] __device__(int i) {
    NodeIdT item_node_id = d_node_id[i];

    if (item_node_id < 0) {
      return;
    }

    Node node = d_nodes[item_node_id];

    if (node.IsLeaf()) {
      return;
    }

    int feature_id = d_feature_id[i];

    if (feature_id == node.split.findex) {
      float fvalue = d_fvalues[i];
      bst_uint instance_id = d_instance_id[i];

      if (fvalue < node.split.fvalue) {
        d_node_id_instance[instance_id] = item_node_id * 2 + 1;
      } else {
        d_node_id_instance[instance_id] = item_node_id * 2 + 2;
      }
    }
  });

  dh::safe_cuda(cudaDeviceSynchronize());

  gpu_data->GatherNodeId();
}
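Aside, not part of the diff: UpdateNodeId relies on the nodes being stored densely in level order, so a child position is pure index arithmetic rather than a pointer chase. A small illustration, with hypothetical helper names:

// Illustrative sketch only -- mirrors the item_node_id * 2 + 1 / + 2 updates above.
inline int left_child_sketch(int node_id) { return 2 * node_id + 1; }
inline int right_child_sketch(int node_id) { return 2 * node_id + 2; }
// e.g. root 0 -> children 1 and 2; node 2 -> children 5 and 6. A tree of
// depth d therefore occupies indices 0 .. (1 << (d + 1)) - 2, and a leaf is
// marked by writing -1 back into d_node_id_instance.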
void GPUBuilder::Sort(int level) {
  thrust::sequence(gpu_data->sort_index_in.tbegin(),
                   gpu_data->sort_index_in.tend());

  cub::DoubleBuffer<NodeIdT> d_keys(gpu_data->node_id.data(),
                                    gpu_data->node_id_temp.data());
  cub::DoubleBuffer<int> d_values(gpu_data->sort_index_in.data(),
                                  gpu_data->sort_index_out.data());

  size_t temp_size = gpu_data->cub_mem.size();

  cub::DeviceSegmentedRadixSort::SortPairs(
      gpu_data->cub_mem.data(), temp_size, d_keys, d_values,
      gpu_data->fvalues.size(), gpu_data->n_features, gpu_data->foffsets.data(),
      gpu_data->foffsets.data() + 1);

  auto zip = thrust::make_zip_iterator(thrust::make_tuple(
      gpu_data->fvalues.tbegin(), gpu_data->instance_id.tbegin()));
  auto zip_temp = thrust::make_zip_iterator(thrust::make_tuple(
      gpu_data->fvalues_temp.tbegin(), gpu_data->instance_id_temp.tbegin()));
  thrust::gather(thrust::device_pointer_cast(d_values.Current()),
                 thrust::device_pointer_cast(d_values.Current()) +
                     gpu_data->sort_index_out.size(),
                 zip, zip_temp);
  thrust::copy(zip_temp, zip_temp + gpu_data->fvalues.size(), zip);

  if (d_keys.Current() == gpu_data->node_id_temp.data()) {
    thrust::copy(gpu_data->node_id_temp.tbegin(), gpu_data->node_id_temp.tend(),
                 gpu_data->node_id.tbegin());
  }
}

void GPUBuilder::ColsampleTree() {
  unsigned n =
      static_cast<unsigned>(param.colsample_bytree * gpu_data->n_features);
  CHECK_GT(n, 0);

  feature_set_tree.resize(gpu_data->n_features);
  std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
  std::shuffle(feature_set_tree.begin(), feature_set_tree.end(),
               common::GlobalRandom());
}

void GPUBuilder::Update(const std::vector<bst_gpair>& gpair, DMatrix* p_fmat,
                        RegTree* p_tree) {
  this->InitData(gpair, *p_fmat, *p_tree);
  this->InitFirstNode();
  this->ColsampleTree();

  for (int level = 0; level < param.max_depth; level++) {
    bool use_multiscan_algorithm = level < multiscan_levels;

    if (level > 0) {
      this->UpdateNodeId(level);
    }

    if (level > 0 && !use_multiscan_algorithm) {
      this->Sort(level);
    }

    find_split(gpu_data, param, level, use_multiscan_algorithm,
               feature_set_tree, &feature_set_level);
  }

  dense2sparse_tree(p_tree, gpu_data->nodes.tbegin(), gpu_data->nodes.tend(),
                    param);
}

void GPUBuilder::InitData(const std::vector<bst_gpair>& gpair, DMatrix& fmat,
                          const RegTree& tree) {
  CHECK(fmat.SingleColBlock()) << "grow_gpu: must have single column block. "
                                  "Try setting 'tree_method' parameter to "
                                  "'exact'";

  if (gpu_data->IsAllocated()) {
    gpu_data->Reset(gpair, param.subsample);
    return;
  }

  MetaInfo info = fmat.info();

  std::vector<int> foffsets;
  foffsets.push_back(0);
  std::vector<int> feature_id;
  std::vector<float> fvalues;
  std::vector<bst_uint> instance_id;
  fvalues.reserve(info.num_col * info.num_row);
  instance_id.reserve(info.num_col * info.num_row);
  feature_id.reserve(info.num_col * info.num_row);

  dmlc::DataIter<ColBatch>* iter = fmat.ColIterator();

  while (iter->Next()) {
    const ColBatch& batch = iter->Value();

    for (int i = 0; i < batch.size; i++) {
      const ColBatch::Inst& col = batch[i];

      for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
           it++) {
        bst_uint inst_id = it->index;
        fvalues.push_back(it->fvalue);
        instance_id.push_back(inst_id);
        feature_id.push_back(i);
      }
      foffsets.push_back(fvalues.size());
    }
  }

  gpu_data->Init(fvalues, foffsets, instance_id, feature_id, gpair,
                 info.num_row, info.num_col, param.max_depth, param);
}

void GPUBuilder::InitFirstNode() {
  // Build the root node on the CPU and copy to device
  gpu_gpair sum_gradients =
      thrust::reduce(gpu_data->gpair.tbegin(), gpu_data->gpair.tend(),
                     gpu_gpair(0, 0), cub::Sum());

  Node tmp = Node(
      sum_gradients,
      CalcGain(gpu_data->param, sum_gradients.grad(), sum_gradients.hess()),
      CalcWeight(gpu_data->param, sum_gradients.grad(), sum_gradients.hess()));

  thrust::copy_n(&tmp, 1, gpu_data->nodes.tbegin());
}
}  // namespace tree
}  // namespace xgboost
@ -1,46 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <xgboost/tree_updater.h>
#include <vector>
#include "../../src/tree/param.h"

namespace xgboost {

namespace tree {

struct gpu_gpair;
struct GPUData;

class GPUBuilder {
 public:
  GPUBuilder();
  void Init(const TrainParam &param);
  ~GPUBuilder();

  void UpdateParam(const TrainParam &param) { this->param = param; }

  void Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
              RegTree *p_tree);

  void UpdateNodeId(int level);

 private:
  void InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,  // NOLINT
                const RegTree &tree);

  void Sort(int level);
  void InitFirstNode();
  void ColsampleTree();

  TrainParam param;
  GPUData *gpu_data;
  std::vector<int> feature_set_tree;
  std::vector<int> feature_set_level;

  int multiscan_levels =
      5;  // Number of levels before switching to sorting algorithm
};
}  // namespace tree
}  // namespace xgboost
@ -3,10 +3,10 @@
 */
#include <xgboost/tree_updater.h>
#include <vector>
#include "../../src/common/random.h"
#include "../../src/common/sync.h"
#include "../../src/tree/param.h"
#include "gpu_builder.cuh"
#include "../../../src/common/random.h"
#include "../../../src/common/sync.h"
#include "../../../src/tree/param.h"
#include "exact/gpu_builder.cuh"
#include "gpu_hist_builder.cuh"

namespace xgboost {
@ -45,7 +45,7 @@ class GPUMaker : public TreeUpdater {
 protected:
  // training parameter
  TrainParam param;
  GPUBuilder builder;
  exact::GPUBuilder<int16_t> builder;
};

template <typename TStats>
128
plugin/updater_gpu/test/cpp/argmax_by_key.cu
Normal file
@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/argmax_by_key.cuh"
|
||||
#include "../../src/exact/gradients.cuh"
|
||||
#include "../../src/exact/node.cuh"
|
||||
#include "../../src/exact/loss_functions.cuh"
|
||||
#include "utils.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
TEST(ArgMaxByKey, maxSplit) {
|
||||
Split a, b, out;
|
||||
a.score = 2.f;
|
||||
a.index = 3;
|
||||
b.score = 3.f;
|
||||
b.index = 4;
|
||||
out = maxSplit(a, b);
|
||||
EXPECT_FLOAT_EQ(out.score, b.score);
|
||||
EXPECT_EQ(out.index, b.index);
|
||||
b.score = 2.f;
|
||||
b.index = 4;
|
||||
out = maxSplit(a, b);
|
||||
EXPECT_FLOAT_EQ(out.score, a.score);
|
||||
EXPECT_EQ(out.index, a.index);
|
||||
b.score = 2.f;
|
||||
b.index = 2;
|
||||
out = maxSplit(a, b);
|
||||
EXPECT_FLOAT_EQ(out.score, a.score);
|
||||
EXPECT_EQ(out.index, b.index);
|
||||
b.score = 1.f;
|
||||
b.index = 1;
|
||||
out = maxSplit(a, b);
|
||||
EXPECT_FLOAT_EQ(out.score, a.score);
|
||||
EXPECT_EQ(out.index, a.index);
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
void argMaxTest(ArgMaxByKeyAlgo algo) {
|
||||
const int nVals = 1024;
|
||||
const int level = 0;
|
||||
const int nKeys = 1 << level;
|
||||
gpu_gpair* scans = new gpu_gpair[nVals];
|
||||
float* vals = new float[nVals];
|
||||
int* colIds = new int[nVals];
|
||||
scans[0] = gpu_gpair();
|
||||
vals[0] = 0.f;
|
||||
colIds[0] = 0;
|
||||
for (int i = 1; i < nVals; ++i) {
|
||||
scans[i].g = scans[i-1].g + (0.1f * 2.f);
|
||||
scans[i].h = scans[i-1].h + (0.1f * 2.f);
|
||||
vals[i] = static_cast<float>(i) * 0.1f;
|
||||
colIds[i] = 0;
|
||||
}
|
||||
float* dVals;
|
||||
allocateAndUpdateOnGpu<float>(dVals, vals, nVals);
|
||||
gpu_gpair* dScans;
|
||||
allocateAndUpdateOnGpu<gpu_gpair>(dScans, scans, nVals);
|
||||
gpu_gpair* sums = new gpu_gpair[nKeys];
|
||||
sums[0].g = sums[0].h = (0.1f * 2.f * nVals);
|
||||
gpu_gpair* dSums;
|
||||
allocateAndUpdateOnGpu<gpu_gpair>(dSums, sums, nKeys);
|
||||
int* dColIds;
|
||||
allocateAndUpdateOnGpu<int>(dColIds, colIds, nVals);
|
||||
Split* splits = new Split[nKeys];
|
||||
Split* dSplits;
|
||||
allocateOnGpu<Split>(dSplits, nKeys);
|
||||
node_id_t* nodeAssigns = new node_id_t[nVals];
|
||||
memset(nodeAssigns, 0, sizeof(node_id_t)*nVals);
|
||||
node_id_t* dNodeAssigns;
|
||||
allocateAndUpdateOnGpu<node_id_t>(dNodeAssigns, nodeAssigns, nVals);
|
||||
Node<node_id_t>* nodes = new Node<node_id_t>[nKeys];
|
||||
nodes[0].gradSum = sums[0];
|
||||
nodes[0].id = 0;
|
||||
TrainParam param;
|
||||
param.min_child_weight = 0.0f;
|
||||
param.reg_alpha = 0.f;
|
||||
param.reg_lambda = 2.f;
|
||||
param.max_delta_step = 0.f;
|
||||
nodes[0].score = CalcGain(param, sums[0].g, sums[0].h);
|
||||
Node<node_id_t>* dNodes;
|
||||
allocateAndUpdateOnGpu<Node<node_id_t> >(dNodes, nodes, nKeys);
|
||||
argMaxByKey<node_id_t>(dSplits, dScans, dSums, dVals, dColIds, dNodeAssigns,
|
||||
dNodes, nKeys, 0, nVals, param, algo);
|
||||
updateHostPtr<Split>(splits, dSplits, nKeys);
|
||||
EXPECT_FLOAT_EQ(0.f, splits->score);
|
||||
EXPECT_EQ(0, splits->index);
|
||||
dh::safe_cuda(cudaFree(dNodeAssigns));
|
||||
delete [] nodeAssigns;
|
||||
dh::safe_cuda(cudaFree(dSplits));
|
||||
delete [] splits;
|
||||
dh::safe_cuda(cudaFree(dColIds));
|
||||
delete [] colIds;
|
||||
dh::safe_cuda(cudaFree(dSums));
|
||||
delete [] sums;
|
||||
dh::safe_cuda(cudaFree(dVals));
|
||||
delete [] vals;
|
||||
dh::safe_cuda(cudaFree(dScans));
|
||||
delete [] scans;
|
||||
}
|
||||
|
||||
TEST(ArgMaxByKey, testOneColGmem) {
|
||||
argMaxTest<int16_t>(ABK_GMEM);
|
||||
}
|
||||
|
||||
TEST(ArgMaxByKey, testOneColSmem) {
|
||||
argMaxTest<int16_t>(ABK_SMEM);
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
117
plugin/updater_gpu/test/cpp/fused_reduce_scan_by_key.cu
Normal file
@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/fused_scan_reduce_by_key.cuh"
|
||||
#include "../../src/exact/node.cuh"
|
||||
#include "utils.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
template <typename node_id_t>
|
||||
class ReduceScanByKey: public Generator<node_id_t> {
|
||||
public:
|
||||
ReduceScanByKey(int nc, int nr, int nk, const std::string& tName):
|
||||
Generator<node_id_t>(nc, nr, nk, tName),
|
||||
hSums(nullptr), dSums(nullptr), hScans(nullptr), dScans(nullptr),
|
||||
outSize(this->size), nSegments(this->nKeys*this->nCols),
|
||||
hOffsets(nullptr), dOffsets(nullptr) {
|
||||
hSums = new gpu_gpair[nSegments];
|
||||
allocateOnGpu<gpu_gpair>(dSums, nSegments);
|
||||
hScans = new gpu_gpair[outSize];
|
||||
allocateOnGpu<gpu_gpair>(dScans, outSize);
|
||||
gpu_gpair* buckets = new gpu_gpair[nSegments];
|
||||
for (int i = 0; i < nSegments; i++) {
|
||||
buckets[i] = gpu_gpair();
|
||||
}
|
||||
for (int i = 0; i < nSegments; i++) {
|
||||
hSums[i] = gpu_gpair();
|
||||
}
|
||||
for (size_t i = 0; i < this->size; i++) {
|
||||
if (this->hKeys[i] >= 0 && this->hKeys[i] < nSegments) {
|
||||
node_id_t key = abs2uniqKey<node_id_t>(i, this->hKeys,
|
||||
this->hColIds, 0,
|
||||
this->nKeys);
|
||||
hSums[key] += this->hVals[i];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < this->size; ++i) {
|
||||
node_id_t key = abs2uniqKey<node_id_t>(i, this->hKeys,
|
||||
this->hColIds, 0,
|
||||
this->nKeys);
|
||||
hScans[i] = buckets[key];
|
||||
buckets[key] += this->hVals[i];
|
||||
}
|
||||
// it's a dense matrix that we are currently looking at, so offsets
|
||||
// are nicely aligned! (need not be the case in real datasets)
|
||||
hOffsets = new int[this->nCols];
|
||||
size_t off = 0;
|
||||
for (int i = 0; i < this->nCols; ++i, off+=this->nRows) {
|
||||
hOffsets[i] = off;
|
||||
}
|
||||
allocateAndUpdateOnGpu<int>(dOffsets, hOffsets, this->nCols);
|
||||
}
|
||||
|
||||
~ReduceScanByKey() {
|
||||
delete [] hScans;
|
||||
delete [] hSums;
|
||||
delete [] hOffsets;
|
||||
dh::safe_cuda(cudaFree(dScans));
|
||||
dh::safe_cuda(cudaFree(dSums));
|
||||
dh::safe_cuda(cudaFree(dOffsets));
|
||||
}
|
||||
|
||||
void run() {
|
||||
gpu_gpair* tmpScans;
|
||||
int* tmpKeys;
|
||||
int tmpSize = scanTempBufferSize(this->size);
|
||||
allocateOnGpu<gpu_gpair>(tmpScans, tmpSize);
|
||||
allocateOnGpu<int>(tmpKeys, tmpSize);
|
||||
TIMEIT(reduceScanByKey<node_id_t>
|
||||
(dSums, dScans, this->dVals, this->dInstIds, this->dKeys,
|
||||
this->size, this->nKeys, this->nCols, tmpScans, tmpKeys,
|
||||
this->dColIds, 0),
|
||||
this->testName);
|
||||
dh::safe_cuda(cudaFree(tmpScans));
|
||||
dh::safe_cuda(cudaFree(tmpKeys));
|
||||
this->compare(hSums, dSums, nSegments);
|
||||
this->compare(hScans, dScans, outSize);
|
||||
}
|
||||
|
||||
private:
|
||||
gpu_gpair* hSums;
|
||||
gpu_gpair* dSums;
|
||||
gpu_gpair* hScans;
|
||||
gpu_gpair* dScans;
|
||||
int outSize;
|
||||
int nSegments;
|
||||
int* hOffsets;
|
||||
int* dOffsets;
|
||||
};
|
||||
|
||||
TEST(ReduceScanByKey, testInt16) {
|
||||
ReduceScanByKey<int16_t>(32, 512, 32, "ReduceScanByKey").run();
|
||||
}
|
||||
|
||||
TEST(ReduceScanByKey, testInt32) {
|
||||
ReduceScanByKey<int>(32, 512, 32, "ReduceScanByKey").run();
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
1637
plugin/updater_gpu/test/cpp/generate_data.sh
Executable file
File diff suppressed because it is too large
308
plugin/updater_gpu/test/cpp/gpu_builder.cu
Normal file
@ -0,0 +1,308 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "utils.cuh"
|
||||
#include "../../src/exact/gpu_builder.cuh"
|
||||
#include "../../src/exact/node.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
static const std::vector<int> smallColSizes = {0, 5, 0, 6, 4, 0, 0, 2, 0, 11,
|
||||
2, 9, 0, 5, 1, 0, 12, 3};
|
||||
|
||||
template <typename node_id_t>
|
||||
void testSmallData() {
|
||||
GPUBuilder<node_id_t> builder;
|
||||
std::shared_ptr<DMatrix> dm =
|
||||
setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
|
||||
builder, 1);
|
||||
// data dimensions
|
||||
ASSERT_EQ(60, builder.nVals);
|
||||
ASSERT_EQ(15, builder.nRows);
|
||||
ASSERT_EQ(18, builder.nCols);
|
||||
ASSERT_TRUE(builder.allocated);
|
||||
// column counts
|
||||
int* tmpOff = new int[builder.nCols+1];
|
||||
updateHostPtr<int>(tmpOff, builder.colOffsets.data(), builder.nCols+1);
|
||||
for (int i = 0; i < 15; ++i) {
|
||||
EXPECT_EQ(smallColSizes[i], tmpOff[i+1]-tmpOff[i]);
|
||||
}
|
||||
float* tmpVal = new float[builder.nVals];
|
||||
updateHostPtr<float>(tmpVal, builder.vals.current(), builder.nVals);
|
||||
int* tmpInst = new int[builder.nVals];
|
||||
updateHostPtr<int>(tmpInst, builder.instIds.current(), builder.nVals);
|
||||
gpu_gpair* tmpGrad = new gpu_gpair[builder.nRows];
|
||||
updateHostPtr<gpu_gpair>(tmpGrad, builder.gradsInst.data(), builder.nRows);
|
||||
EXPECT_EQ(0, tmpInst[0]);
|
||||
EXPECT_FLOAT_EQ(1.f, tmpVal[0]);
|
||||
EXPECT_FLOAT_EQ(1.f+(float)(tmpInst[0]%10), get(0, tmpGrad, tmpInst).g);
|
||||
EXPECT_FLOAT_EQ(.5f+(float)(tmpInst[0]%10), get(0, tmpGrad, tmpInst).h);
|
||||
EXPECT_EQ(2, tmpInst[1]);
|
||||
EXPECT_FLOAT_EQ(1.f, tmpVal[1]);
|
||||
EXPECT_FLOAT_EQ(1.f+(float)(tmpInst[1]%10), get(1, tmpGrad, tmpInst).g);
|
||||
EXPECT_FLOAT_EQ(.5f+(float)(tmpInst[1]%10), get(1, tmpGrad, tmpInst).h);
|
||||
EXPECT_EQ(7, tmpInst[2]);
|
||||
EXPECT_FLOAT_EQ(1.f, tmpVal[2]);
|
||||
EXPECT_FLOAT_EQ(1.f+(float)(tmpInst[2]%10), get(2, tmpGrad, tmpInst).g);
|
||||
EXPECT_FLOAT_EQ(.5f+(float)(tmpInst[2]%10), get(2, tmpGrad, tmpInst).h);
|
||||
delete [] tmpGrad;
|
||||
delete [] tmpOff;
|
||||
delete [] tmpInst;
|
||||
delete [] tmpVal;
|
||||
int* colIds = new int[builder.nVals];
|
||||
updateHostPtr<int>(colIds, builder.colIds.data(), builder.nVals);
|
||||
std::vector<int> colSizeCopy(smallColSizes);
|
||||
int colIdxCurr = 0;
|
||||
for (int i = 0; i < builder.nVals; ++i) {
|
||||
while (colSizeCopy[colIdxCurr] == 0) {
|
||||
++colIdxCurr;
|
||||
}
|
||||
--colSizeCopy[colIdxCurr];
|
||||
EXPECT_EQ(colIdxCurr, colIds[i]);
|
||||
}
|
||||
delete [] colIds;
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, SetupOneTimeDataSmallInt16) {
|
||||
testSmallData<int16_t>();
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, SetupOneTimeDataSmallInt32) {
|
||||
testSmallData<int>();
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
void testLargeData() {
|
||||
GPUBuilder<node_id_t> builder;
|
||||
std::shared_ptr<DMatrix> dm =
|
||||
setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/sample.libsvm",
|
||||
builder, 1);
|
||||
ASSERT_EQ(35442, builder.nVals);
|
||||
ASSERT_EQ(1611, builder.nRows);
|
||||
ASSERT_EQ(127, builder.nCols);
|
||||
ASSERT_TRUE(builder.allocated);
|
||||
int* tmpOff = new int[builder.nCols+1];
|
||||
updateHostPtr<int>(tmpOff, builder.colOffsets.data(), builder.nCols+1);
|
||||
EXPECT_EQ(0, tmpOff[1]-tmpOff[0]); // 1st col
|
||||
EXPECT_EQ(83, tmpOff[2]-tmpOff[1]); // 2nd col
|
||||
EXPECT_EQ(1, tmpOff[3]-tmpOff[2]); // 3rd col
|
||||
float* tmpVal = new float[builder.nVals];
|
||||
updateHostPtr<float>(tmpVal, builder.vals.current(), builder.nVals);
|
||||
int* tmpInst = new int[builder.nVals];
|
||||
updateHostPtr<int>(tmpInst, builder.instIds.current(), builder.nVals);
|
||||
gpu_gpair* tmpGrad = new gpu_gpair[builder.nRows];
|
||||
updateHostPtr<gpu_gpair>(tmpGrad, builder.gradsInst.data(), builder.nRows);
|
||||
// the order of observations is messed up before the convertToCsc call!
|
||||
// hence, the instance IDs have been manually checked and put here.
|
||||
EXPECT_EQ(1164, tmpInst[0]);
|
||||
EXPECT_FLOAT_EQ(1.f, tmpVal[0]);
|
||||
EXPECT_FLOAT_EQ(1.f+(float)(tmpInst[0]%10), get(0, tmpGrad, tmpInst).g);
|
||||
EXPECT_FLOAT_EQ(.5f+(float)(tmpInst[0]%10), get(0, tmpGrad, tmpInst).h);
|
||||
EXPECT_EQ(1435, tmpInst[1]);
|
||||
EXPECT_FLOAT_EQ(1.f, tmpVal[1]);
|
||||
EXPECT_FLOAT_EQ(1.f+(float)(tmpInst[1]%10), get(1, tmpGrad, tmpInst).g);
|
||||
EXPECT_FLOAT_EQ(.5f+(float)(tmpInst[1]%10), get(1, tmpGrad, tmpInst).h);
|
||||
EXPECT_EQ(1421, tmpInst[2]);
|
||||
EXPECT_FLOAT_EQ(1.f, tmpVal[2]);
|
||||
EXPECT_FLOAT_EQ(1.f+(float)(tmpInst[2]%10), get(2, tmpGrad, tmpInst).g);
|
||||
EXPECT_FLOAT_EQ(.5f+(float)(tmpInst[2]%10), get(2, tmpGrad, tmpInst).h);
|
||||
delete [] tmpGrad;
|
||||
delete [] tmpOff;
|
||||
delete [] tmpInst;
|
||||
delete [] tmpVal;
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, SetupOneTimeDataLargeInt16) {
|
||||
testLargeData<int16_t>();
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, SetupOneTimeDataLargeInt32) {
|
||||
testLargeData<int>();
|
||||
}
|
||||
|
||||
int getColId(int* offsets, int id, int nCols) {
|
||||
for (int i = 1; i <= nCols; ++i) {
|
||||
if (id < offsets[i]) {
|
||||
return (i-1);
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
void testAllocate() {
|
||||
GPUBuilder<node_id_t> builder;
|
||||
std::shared_ptr<DMatrix> dm =
|
||||
setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
|
||||
builder, 1);
|
||||
ASSERT_EQ(3, builder.maxNodes);
|
||||
ASSERT_EQ(2, builder.maxLeaves);
|
||||
Node<node_id_t>* n = new Node<node_id_t>[builder.maxNodes];
|
||||
updateHostPtr<Node<node_id_t> >(n, builder.nodes.data(), builder.maxNodes);
|
||||
for (int i = 0; i < builder.maxNodes; ++i) {
|
||||
if (i == 0) {
|
||||
EXPECT_FALSE(n[i].isLeaf());
|
||||
EXPECT_FALSE(n[i].isUnused());
|
||||
} else {
|
||||
EXPECT_TRUE(n[i].isLeaf());
|
||||
EXPECT_FALSE(n[i].isUnused());
|
||||
}
|
||||
}
|
||||
gpu_gpair sum;
|
||||
sum.g = 0.f;
|
||||
sum.h = 0.f;
|
||||
for (int i = 0; i < builder.maxNodes; ++i) {
|
||||
if (!n[i].isUnused()) {
|
||||
sum += n[i].gradSum;
|
||||
}
|
||||
}
|
||||
// law of conservation of gradients! :)
|
||||
EXPECT_FLOAT_EQ(2.f*n[0].gradSum.g, sum.g);
|
||||
EXPECT_FLOAT_EQ(2.f*n[0].gradSum.h, sum.h);
|
||||
node_id_t* assigns = new node_id_t[builder.nVals];
|
||||
int* offsets = new int[builder.nCols+1];
|
||||
updateHostPtr<node_id_t>(assigns, builder.nodeAssigns.current(),
|
||||
builder.nVals);
|
||||
updateHostPtr<int>(offsets, builder.colOffsets.data(), builder.nCols+1);
|
||||
for (int i = 0; i < builder.nVals; ++i) {
|
||||
EXPECT_EQ((node_id_t)0, assigns[i]);
|
||||
}
|
||||
delete [] n;
|
||||
delete [] assigns;
|
||||
delete [] offsets;
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, AllocateNodeDataInt16) {
|
||||
testAllocate<int16_t>();
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, AllocateNodeDataInt32) {
|
||||
testAllocate<int>();
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
void assign(Node<node_id_t> *n, float g, float h, float sc, float wt,
|
||||
DefaultDirection d, float th, int c, int i) {
|
||||
n->gradSum.g = g;
|
||||
n->gradSum.h = h;
|
||||
n->score = sc;
|
||||
n->weight = wt;
|
||||
n->dir = d;
|
||||
n->threshold = th;
|
||||
n->colIdx = c;
|
||||
n->id = (node_id_t)i;
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
void testMarkLeaves() {
|
||||
GPUBuilder<node_id_t> builder;
|
||||
std::shared_ptr<DMatrix> dm =
|
||||
setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
|
||||
builder, 3);
|
||||
ASSERT_EQ(15, builder.maxNodes);
|
||||
ASSERT_EQ(8, builder.maxLeaves);
|
||||
Node<node_id_t>* hNodes = new Node<node_id_t>[builder.maxNodes];
|
||||
assign<node_id_t>(&hNodes[0], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.25f, 0, 0);
|
||||
assign<node_id_t>(&hNodes[1], 2.f, 1.f, .75f, 0.5f, RightDir, 0.5f, 1, 1);
|
||||
assign<node_id_t>(&hNodes[2], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.75f, 2, 2);
|
||||
assign<node_id_t>(&hNodes[3], 2.f, 1.f, .75f, 0.5f, RightDir, 1.f, 3, 3);
|
||||
assign<node_id_t>(&hNodes[4], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.25f, 4, 4);
|
||||
hNodes[5] = Node<node_id_t>();
|
||||
assign<node_id_t>(&hNodes[6], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.75f, 6, 6);
|
||||
hNodes[7] = Node<node_id_t>();
|
||||
hNodes[8] = Node<node_id_t>();
|
||||
hNodes[9] = Node<node_id_t>();
|
||||
hNodes[10] = Node<node_id_t>();
|
||||
hNodes[11] = Node<node_id_t>();
|
||||
hNodes[12] = Node<node_id_t>();
|
||||
hNodes[13] = Node<node_id_t>();
|
||||
hNodes[14] = Node<node_id_t>();
|
||||
updateDevicePtr<Node<node_id_t> >(builder.nodes.data(), hNodes, builder.maxNodes);
|
||||
builder.markLeaves();
|
||||
Node<node_id_t>* outNodes = new Node<node_id_t>[builder.maxNodes];
|
||||
updateHostPtr<Node<node_id_t> >(outNodes, builder.nodes.data(), builder.maxNodes);
|
||||
for (int i = 0; i < builder.maxNodes; ++i) {
|
||||
if ((i >= 7) || (i == 5)) {
|
||||
EXPECT_TRUE(outNodes[i].isUnused());
|
||||
} else {
|
||||
EXPECT_FALSE(outNodes[i].isUnused());
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < builder.maxNodes; ++i) {
|
||||
if ((i == 3) || (i == 4) || (i == 6)) {
|
||||
EXPECT_TRUE(outNodes[i].isLeaf());
|
||||
} else {
|
||||
EXPECT_FALSE(outNodes[i].isLeaf());
|
||||
}
|
||||
}
|
||||
delete [] outNodes;
|
||||
delete [] hNodes;
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, MarkLeavesInt16) {
|
||||
testMarkLeaves<int16_t>();
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, MarkLeavesInt32) {
|
||||
testMarkLeaves<int>();
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
void testDense2Sparse() {
|
||||
GPUBuilder<node_id_t> builder;
|
||||
std::shared_ptr<DMatrix> dm =
|
||||
setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
|
||||
builder, 3);
|
||||
ASSERT_EQ(15, builder.maxNodes);
|
||||
ASSERT_EQ(8, builder.maxLeaves);
|
||||
Node<node_id_t>* hNodes = new Node<node_id_t>[builder.maxNodes];
|
||||
assign<node_id_t>(&hNodes[0], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.25f, 0, 0);
|
||||
assign<node_id_t>(&hNodes[1], 2.f, 1.f, .75f, 0.5f, RightDir, 0.5f, 1, 1);
|
||||
assign<node_id_t>(&hNodes[2], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.75f, 2, 2);
|
||||
assign<node_id_t>(&hNodes[3], 2.f, 1.f, .75f, 0.5f, RightDir, 1.f, 3, 3);
|
||||
assign<node_id_t>(&hNodes[4], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.25f, 4, 4);
|
||||
hNodes[5] = Node<node_id_t>();
|
||||
assign<node_id_t>(&hNodes[6], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.75f, 6, 6);
|
||||
assign<node_id_t>(&hNodes[7], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.75f, 7, 7);
|
||||
hNodes[8] = Node<node_id_t>();
|
||||
hNodes[9] = Node<node_id_t>();
|
||||
hNodes[10] = Node<node_id_t>();
|
||||
hNodes[11] = Node<node_id_t>();
|
||||
hNodes[12] = Node<node_id_t>();
|
||||
hNodes[13] = Node<node_id_t>();
|
||||
hNodes[14] = Node<node_id_t>();
|
||||
updateDevicePtr<Node<node_id_t> >(builder.nodes.data(), hNodes, builder.maxNodes);
|
||||
builder.markLeaves();
|
||||
RegTree tree;
|
||||
builder.dense2sparse(tree);
|
||||
EXPECT_EQ(9, tree.param.num_nodes);
|
||||
delete [] hNodes;
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, Dense2SparseInt16) {
|
||||
testDense2Sparse<int16_t>();
|
||||
}
|
||||
|
||||
TEST(CudaGPUBuilderTest, Dense2SparseInt32) {
|
||||
testDense2Sparse<int>();
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
64
plugin/updater_gpu/test/cpp/node.cu
Normal file
@ -0,0 +1,64 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "gtest/gtest.h"
#include "../../src/exact/node.cuh"


namespace xgboost {
namespace tree {
namespace exact {

TEST(Split, Test) {
  Split s;
  EXPECT_FALSE(s.isSplittable(0.5f));
  s.score = 1.f;
  EXPECT_FALSE(s.isSplittable(0.5f));
  s.index = 2;
  EXPECT_TRUE(s.isSplittable(0.5f));
  EXPECT_FALSE(s.isSplittable(1.5f));
}

TEST(Node, Test) {
  Node<int16_t> n;
  EXPECT_TRUE(n.isUnused());
  EXPECT_FALSE(n.isLeaf());
  EXPECT_TRUE(n.isDefaultLeft());
  n.dir = RightDir;
  EXPECT_TRUE(n.isUnused());
  EXPECT_FALSE(n.isLeaf());
  EXPECT_FALSE(n.isDefaultLeft());
  n.id = 123;
  EXPECT_FALSE(n.isUnused());
  EXPECT_TRUE(n.isLeaf());
  EXPECT_FALSE(n.isDefaultLeft());
  n.score = 0.5f;
  EXPECT_FALSE(n.isUnused());
  EXPECT_FALSE(n.isLeaf());
  EXPECT_FALSE(n.isDefaultLeft());
}

TEST(Segment, Test) {
  Segment s;
  EXPECT_FALSE(s.isValid());
  s.start = 2;
  EXPECT_FALSE(s.isValid());
  s.end = 4;
  EXPECT_TRUE(s.isValid());
}

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
41
plugin/updater_gpu/test/cpp/utils.cu
Normal file
@ -0,0 +1,41 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "utils.cuh"


namespace xgboost {
namespace tree {
namespace exact {

std::shared_ptr<DMatrix> generateData(const std::string& file) {
  std::shared_ptr<DMatrix> data(DMatrix::Load(file, false, false, "libsvm"));
  return data;
}

std::shared_ptr<DMatrix> preparePluginInputs(const std::string &file,
                                             std::vector<bst_gpair> *gpair) {
  std::shared_ptr<DMatrix> dm = generateData(file);
  gpair->reserve(dm->info().num_row);
  for (int i = 0; i < dm->info().num_row; ++i) {
    gpair->push_back(bst_gpair(1.f + static_cast<float>(i % 10),
                               0.5f + static_cast<float>(i % 10)));
  }
  return dm;
}

}  // namespace exact
}  // namespace tree
}  // namespace xgboost
232
plugin/updater_gpu/test/cpp/utils.cuh
Normal file
@ -0,0 +1,232 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
#include "../../src/exact/gradients.cuh"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <xgboost/data.h>
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/gpu_builder.cuh"
|
||||
#include "../../src/device_helpers.cuh"
|
||||
#include <vector>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
|
||||
template <typename T>
|
||||
inline void allocateOnGpu(T*& arr, size_t nElems) {
|
||||
dh::safe_cuda(cudaMalloc((void**)&arr, sizeof(T)*nElems));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void updateDevicePtr(T* dArr, const T* hArr, size_t nElems) {
|
||||
dh::safe_cuda(cudaMemcpy(dArr, hArr, sizeof(T)*nElems, cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void updateHostPtr(T* hArr, const T* dArr, size_t nElems) {
|
||||
dh::safe_cuda(cudaMemcpy(hArr, dArr, sizeof(T)*nElems, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void allocateAndUpdateOnGpu(T*& dArr, const T* hArr, size_t nElems) {
|
||||
allocateOnGpu<T>(dArr, nElems);
|
||||
updateDevicePtr<T>(dArr, hArr, nElems);
|
||||
}
|
||||
|
||||
static const float Thresh = 0.005f;
|
||||
static const float SuperSmall = 0.001f;
|
||||
static const float SuperSmallThresh = 0.00001f;
|
||||
|
||||
// lets assume dense matrix for simplicity
|
||||
template <typename T>
|
||||
class Generator {
|
||||
public:
|
||||
Generator(int nc, int nr, int nk, const std::string& tName):
|
||||
nCols(nc), nRows(nr), nKeys(nk), size(nc*nr), hKeys(nullptr),
|
||||
dKeys(nullptr), hVals(nullptr), dVals(nullptr), testName(tName),
|
||||
dColIds(nullptr), hColIds(nullptr), dInstIds(nullptr),
|
||||
hInstIds(nullptr) {
|
||||
generateKeys();
|
||||
generateVals();
|
||||
// to simulate the same sorted key-value pairs in the main code
|
||||
// don't need it as generateKeys always generates in sorted order!
|
||||
//sortKeyValues();
|
||||
evalColIds();
|
||||
evalInstIds();
|
||||
}
|
||||
|
||||
virtual ~Generator() {
|
||||
delete [] hKeys;
|
||||
delete [] hVals;
|
||||
delete [] hColIds;
|
||||
delete [] hInstIds;
|
||||
dh::safe_cuda(cudaFree(dColIds));
|
||||
dh::safe_cuda(cudaFree(dKeys));
|
||||
dh::safe_cuda(cudaFree(dVals));
|
||||
dh::safe_cuda(cudaFree(dInstIds));
|
||||
}
|
||||
|
||||
virtual void run() = 0;
|
||||
|
||||
protected:
|
||||
int nCols;
|
||||
int nRows;
|
||||
int nKeys;
|
||||
int size;
|
||||
T* hKeys;
|
||||
T* dKeys;
|
||||
gpu_gpair* hVals;
|
||||
gpu_gpair* dVals;
|
||||
std::string testName;
|
||||
int* dColIds;
|
||||
int* hColIds;
|
||||
int* dInstIds;
|
||||
int* hInstIds;
|
||||
|
||||
void evalColIds() {
|
||||
hColIds = new int[size];
|
||||
for (int i=0;i<size;++i) {
|
||||
hColIds[i] = i / nRows;
|
||||
}
|
||||
allocateAndUpdateOnGpu<int>(dColIds, hColIds, size);
|
||||
}
|
||||
|
||||
void evalInstIds() {
|
||||
hInstIds = new int[size];
|
||||
for (int i=0;i<size;++i) {
|
||||
hInstIds[i] = i;
|
||||
}
|
||||
allocateAndUpdateOnGpu<int>(dInstIds, hInstIds, size);
|
||||
}
|
||||
|
||||
float diffRatio(float a, float b, bool& isSmall) {
|
||||
isSmall = true;
|
||||
if (a == 0.f) return fabs(b);
|
||||
else if (b == 0.f) return fabs(a);
|
||||
else if ((fabs(a) < SuperSmall) && (fabs(b) < SuperSmall)) {
|
||||
return fabs(a - b);
|
||||
}
|
||||
else {
|
||||
isSmall = false;
|
||||
return fabs((a < b)? (b - a)/b : (a - b)/a);
|
||||
}
|
||||
}
|
||||
|
||||
void compare(gpu_gpair* exp, gpu_gpair* dAct, size_t len) {
|
||||
gpu_gpair* act = new gpu_gpair[len];
|
||||
updateHostPtr<gpu_gpair>(act, dAct, len);
|
||||
for (size_t i=0;i<len;++i) {
|
||||
bool isSmall;
|
||||
float ratioG = diffRatio(exp[i].g, act[i].g, isSmall);
|
||||
float ratioH = diffRatio(exp[i].h, act[i].h, isSmall);
|
||||
float thresh = isSmall? SuperSmallThresh : Thresh;
|
||||
if ((ratioG >= Thresh) || (ratioH >= Thresh)) {
|
||||
printf("(exp) %f %f -> (act) %f %f : rG=%f rH=%f th=%f @%lu\n",
|
||||
exp[i].g, exp[i].h, act[i].g, act[i].h, ratioG, ratioH,
|
||||
thresh, i);
|
||||
}
|
||||
ASSERT_TRUE(ratioG < thresh);
|
||||
ASSERT_TRUE(ratioH < thresh);
|
||||
}
|
||||
delete [] act;
|
||||
}
|
||||
|
||||
void generateKeys() {
|
||||
hKeys = new T[size];
|
||||
T currKey = 0;
|
||||
for (int i=0;i<size;++i) {
|
||||
if (i % nRows == 0) { // start fresh for a new column
|
||||
currKey = 0;
|
||||
}
|
||||
hKeys[i] = currKey;
|
||||
float val = randVal();
|
||||
if ((val > 0.8f) && (currKey < nKeys-1)) {
|
||||
++currKey;
|
||||
}
|
||||
}
|
||||
allocateAndUpdateOnGpu<T>(dKeys, hKeys, size);
|
||||
}
|
||||
|
||||
void generateVals() {
|
||||
hVals = new gpu_gpair[size];
|
||||
for (size_t i=0;i<size;++i) {
|
||||
hVals[i].g = randVal(-1.f, 1.f);
|
||||
hVals[i].h = randVal(-1.f, 1.f);
|
||||
}
|
||||
allocateAndUpdateOnGpu<gpu_gpair>(dVals, hVals, size);
|
||||
}
|
||||
|
||||
void sortKeyValues() {
|
||||
char* storage = nullptr;
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceRadixSort::SortPairs(NULL, tmpSize, dKeys, dKeys,
|
||||
dVals, dVals, size));
|
||||
allocateOnGpu<char>(storage, tmpSize);
|
||||
void* tmpStorage = static_cast<void*>(storage);
|
||||
dh::safe_cuda(cub::DeviceRadixSort::SortPairs(tmpStorage, tmpSize, dKeys,
|
||||
dKeys, dVals, dVals, size));
|
||||
dh::safe_cuda(cudaFree(storage));
|
||||
updateHostPtr<gpu_gpair>(hVals, dVals, size);
|
||||
updateHostPtr<T>(hKeys, dKeys, size);
|
||||
}
|
||||
|
||||
float randVal() const {
|
||||
float val = rand() * 1.f / RAND_MAX;
|
||||
return val;
|
||||
}
|
||||
|
||||
float randVal(float min, float max) const {
|
||||
float val = randVal();
|
||||
val = (val * (max - min)) - min;
|
||||
return val;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
std::shared_ptr<DMatrix> generateData(const std::string& file);
|
||||
|
||||
std::shared_ptr<DMatrix> preparePluginInputs(const std::string& file,
|
||||
std::vector<bst_gpair> *gpair);
|
||||
|
||||
template <typename node_id_t>
|
||||
std::shared_ptr<DMatrix> setupGPUBuilder(const std::string& file,
|
||||
GPUBuilder<node_id_t> &builder,
|
||||
int max_depth=1) {
|
||||
std::vector<bst_gpair> gpair;
|
||||
std::shared_ptr<DMatrix> dm = preparePluginInputs(file, &gpair);
|
||||
TrainParam p;
|
||||
RegTree tree;
|
||||
p.min_split_loss = 0.f;
|
||||
p.max_depth = max_depth;
|
||||
p.min_child_weight = 0.f;
|
||||
p.reg_alpha = 0.f;
|
||||
p.reg_lambda = 1.f;
|
||||
p.max_delta_step = 0.f;
|
||||
builder.Init(p);
|
||||
builder.Update(gpair, dm.get(), &tree);
|
||||
return dm;
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@ -62,7 +62,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
    num_col_batch += 1;
    EXPECT_EQ(col_iter->Value().size, dmat->info().num_col)
        << "Expected batch size = number of cells as #batches is 1.";
    for (int i = 0; i < col_iter->Value().size; ++i) {
    for (int i = 0; i < static_cast<int>(col_iter->Value().size); ++i) {
      EXPECT_EQ(col_iter->Value()[i].length, dmat->GetColSize(i))
          << "Expected length of each colbatch = colsize as #batches is 1.";
    }
@ -106,7 +106,7 @@ TEST(SimpleDMatrix, ColAccessWithBatches) {
    num_col_batch += 1;
    EXPECT_EQ(col_iter->Value().size, dmat->info().num_col)
        << "Expected batch size = num_cols as max_row_perbatch is 1.";
    for (int i = 0; i < col_iter->Value().size; ++i) {
    for (int i = 0; i < static_cast<int>(col_iter->Value().size); ++i) {
      EXPECT_LE(col_iter->Value()[i].length, 1)
          << "Expected length of each colbatch <=1 as max_row_perbatch is 1.";
    }

@ -40,7 +40,7 @@ void CheckObjFunction(xgboost::ObjFunction * obj,
  obj->GetGradient(preds, info, 1, &gpair);

  ASSERT_EQ(gpair.size(), preds.size());
  for (int i = 0; i < gpair.size(); ++i) {
  for (int i = 0; i < static_cast<int>(gpair.size()); ++i) {
    EXPECT_NEAR(gpair[i].grad, out_grad[i], 0.01)
        << "Unexpected grad for pred=" << preds[i] << " label=" << labels[i]
        << " weight=" << weights[i];

@ -49,7 +49,7 @@ TEST(Objective, LogisticRegressionBasic) {
  std::vector<xgboost::bst_float> preds = {0, 0.1, 0.5, 0.9, 1};
  std::vector<xgboost::bst_float> out_preds = {0.5, 0.524, 0.622, 0.710, 0.731};
  obj->PredTransform(&preds);
  for (int i = 0; i < preds.size(); ++i) {
  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
    EXPECT_NEAR(preds[i], out_preds[i], 0.01);
  }
}
@ -97,7 +97,7 @@ TEST(Objective, PoissonRegressionBasic) {
  std::vector<xgboost::bst_float> preds = {0, 0.1, 0.5, 0.9, 1};
  std::vector<xgboost::bst_float> out_preds = {1, 1.10, 1.64, 2.45, 2.71};
  obj->PredTransform(&preds);
  for (int i = 0; i < preds.size(); ++i) {
  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
    EXPECT_NEAR(preds[i], out_preds[i], 0.01);
  }
}
@ -132,7 +132,7 @@ TEST(Objective, GammaRegressionBasic) {
  std::vector<xgboost::bst_float> preds = {0, 0.1, 0.5, 0.9, 1};
  std::vector<xgboost::bst_float> out_preds = {1, 1.10, 1.64, 2.45, 2.71};
  obj->PredTransform(&preds);
  for (int i = 0; i < preds.size(); ++i) {
  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
    EXPECT_NEAR(preds[i], out_preds[i], 0.01);
  }
}
@ -168,7 +168,7 @@ TEST(Objective, TweedieRegressionBasic) {
  std::vector<xgboost::bst_float> preds = {0, 0.1, 0.5, 0.9, 1};
  std::vector<xgboost::bst_float> out_preds = {1, 1.10, 1.64, 2.45, 2.71};
  obj->PredTransform(&preds);
  for (int i = 0; i < preds.size(); ++i) {
  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
    EXPECT_NEAR(preds[i], out_preds[i], 0.01);
  }
}
@ -1,9 +1,21 @@
UTEST_ROOT=tests/cpp
UTEST_OBJ_ROOT=build_$(UTEST_ROOT)
UNITTEST=$(UTEST_ROOT)/xgboost_test

UNITTEST_SRC=$(wildcard $(UTEST_ROOT)/*.cc $(UTEST_ROOT)/*/*.cc)
UNITTEST_OBJ=$(patsubst $(UTEST_ROOT)%.cc, $(UTEST_OBJ_ROOT)%.o, $(UNITTEST_SRC))

# for if and when we add cuda source files into xgboost core
UNITTEST_CU_SRC=$(wildcard $(UTEST_ROOT)/*.cu $(UTEST_ROOT)/*/*.cu)
UNITTEST_OBJ += $(patsubst $(UTEST_ROOT)%.cu, $(UTEST_OBJ_ROOT)%.o, $(UNITTEST_CU_SRC))

# tests from grow_gpu plugin (only if CUDA path is enabled!)
ifeq ($(PLUGIN_UPDATER_GPU),ON)
GPU_PLUGIN_FOLDER = plugin/updater_gpu
UNITTEST_CU_PLUGIN_SRC = $(wildcard $(GPU_PLUGIN_FOLDER)/test/cpp/*.cu)
UNITTEST_OBJ += $(patsubst %.cu, $(UTEST_OBJ_ROOT)/%.o, $(UNITTEST_CU_PLUGIN_SRC))
endif

GTEST_LIB=$(GTEST_PATH)/lib/
GTEST_INC=$(GTEST_PATH)/include/

@ -13,6 +25,19 @@ UNITTEST_DEPS=lib/libxgboost.a $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)

COVER_OBJ=$(patsubst %.o, %.gcda, $(ALL_OBJ)) $(patsubst %.o, %.gcda, $(UNITTEST_OBJ))

# the order of the below targets matters!
$(UTEST_OBJ_ROOT)/$(GPU_PLUGIN_FOLDER)/test/cpp/%.o: $(GPU_PLUGIN_FOLDER)/test/cpp/%.cu
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -I$(GTEST_INC) -o $@ -c $<

$(UTEST_OBJ_ROOT)/%.o: $(UTEST_ROOT)/%.cu
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -I$(GTEST_INC) -o $@ -c $<

$(UTEST_OBJ_ROOT)/$(GTEST_PATH)/%.o: $(GTEST_PATH)/%.cc
	@mkdir -p $(@D)
	$(CXX) $(UNITTEST_CFLAGS) -I$(GTEST_INC) -I$(GTEST_PATH) -o $@ -c $<

$(UTEST_OBJ_ROOT)/%.o: $(UTEST_ROOT)/%.cc
	@mkdir -p $(@D)
	$(CXX) $(UNITTEST_CFLAGS) -I$(GTEST_INC) -o $@ -c $<