diff --git a/.gitmodules b/.gitmodules index dbf7ee1a4..f271ce442 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,6 +4,9 @@ [submodule "rabit"] path = rabit url = https://github.com/dmlc/rabit +[submodule "nccl"] + path = nccl + url = https://github.com/dmlc/nccl [submodule "cub"] path = cub url = https://github.com/NVlabs/cub diff --git a/CMakeLists.txt b/CMakeLists.txt index a44cbe0ed..a4a764213 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,40 +94,58 @@ if(MSVC) else() set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}) + #Prevent shared library being called liblibxgboost.so on Linux + set(CMAKE_SHARED_LIBRARY_PREFIX "") endif() set(LINK_LIBRARIES dmlccore rabit) if(PLUGIN_UPDATER_GPU) - #Find cub - set(CUB_DIRECTORY "cub/" CACHE PATH "CUB 1.5.4 directory") - include_directories(${CUB_DIRECTORY}) + # nccl + set(LINK_LIBRARIES ${LINK_LIBRARIES} nccl) + add_subdirectory(nccl) + set(NCCL_DIRECTORY ${PROJECT_SOURCE_DIR}/nccl) + include_directories(${NCCL_DIRECTORY}/src) + set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES}) + + #Find cub + set(CUB_DIRECTORY ${PROJECT_SOURCE_DIR}/cub/) + include_directories(${CUB_DIRECTORY}) + #Find googletest set(GTEST_DIRECTORY "${CACHE_PREFIX}" CACHE PATH "Googletest directory") include_directories(${GTEST_DIRECTORY}/include) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_60;-lineinfo;") - if(NOT MSVC) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC") - endif() + + # plugin + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lineinfo;--expt-extended-lambda") set(CUDA_SOURCES plugin/updater_gpu/src/updater_gpu.cu plugin/updater_gpu/src/gpu_hist_builder.cu - ) - cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS}) - set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES}) + ) + include(${PROJECT_SOURCE_DIR}/cmake/Utils.cmake) + include(${PROJECT_SOURCE_DIR}/cmake/Cuda.cmake) + # use below for forcing specific arch + #cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS} -arch=compute_52) + # use below for auto-detect, but gpu_grow currently doesn't work with 61 + xgboost_cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS}) + if(MSVC) + else() + cuda_add_library(updater_gpu STATIC ${CUDA_SOURCES}) + set(LINK_LIBRARIES ${LINK_LIBRARIES} updater_gpu) + endif() else() set(CUDA_OBJS "") + set(updater_gpu "") endif() add_library(objxgboost OBJECT ${SOURCES}) set_target_properties(${objxgboost} PROPERTIES POSITION_INDEPENDENT_CODE 1) -add_executable(runxgboost $ ${CUDA_OBJS}) -set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost) -target_link_libraries(runxgboost ${LINK_LIBRARIES}) +add_library(libxgboost SHARED $ ${CUDA_OBJS}) +add_executable(xgboost $ ${CUDA_OBJS}) -add_library(xgboost SHARED $ ${CUDA_OBJS}) target_link_libraries(xgboost ${LINK_LIBRARIES}) +target_link_libraries(libxgboost ${LINK_LIBRARIES}) option(JVM_BINDINGS "Build JVM bindings" OFF) @@ -136,11 +154,11 @@ if(JVM_BINDINGS) include_directories(${JNI_INCLUDE_DIRS} jvm-packages/xgboost4j/src/native) - add_library(xgboost4j SHARED + add_library(libxgboost4j SHARED $ ${CUDA_OBJS} jvm-packages/xgboost4j/src/native/xgboost4j.cpp) - target_link_libraries(xgboost4j + target_link_libraries(libxgboost4j ${LINK_LIBRARIES} ${JNI_LIBRARIES}) endif() diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake new file mode 100644 index 000000000..30c5139d5 --- /dev/null +++ b/cmake/Cuda.cmake @@ -0,0 +1,289 @@ + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-std=c++11" 
SUPPORT_CXX11) + +################################################################################################ +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# mshadow_detect_installed_gpus(out_variable) +function(xgboost_detect_installed_gpus out_variable) +set(CUDA_gpu_detect_output "") + if(NOT CUDA_gpu_detect_output) + set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${__cufile} "" + "#include \n" + "int main()\n" + "{\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device)\n" + " {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + if(MSVC) + #find vcvarsall.bat and run it building msvc environment + get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) + find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..") + execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + else() + if(CUDA_LIBRARY_PATH) + set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}") + endif() + execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() + if(__nvcc_res EQUAL 0) + # nvcc outputs text containing line breaks when building with MSVC. + # The line below prevents CMake from inserting a variable with line + # breaks in the cache + string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") + string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") + set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from xgboost_detect_gpus tool" FORCE) + else() + message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}") + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(WARNING "Automatic GPU detection failed. 
Building for all known architectures (${xgboost_known_gpu_archs}).") + set(${out_variable} ${xgboost_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +################################################################################################ +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# xgboost_select_nvcc_arch_flags(out_variable) +function(xgboost_select_nvcc_arch_flags out_variable) + # List of arch names + set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(__archs_name_default "All") + if(NOT CMAKE_CROSSCOMPILING) + list(APPEND __archs_names "Auto") + set(__archs_name_default "Auto") + endif() + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " __archs_names "${__archs_names}") + message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${xgboost_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Fermi") + set(__cuda_arch_bin "20 21(20)") + elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(__cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(__cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(__cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(__cuda_arch_bin ${xgboost_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") + xgboost_detect_installed_gpus(__cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(__cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX REPLACE "\\." 
"" __cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") + xgboost_list_unique(__cuda_arch_bin __cuda_arch_ptx) + + set(__nvcc_flags "") + set(__nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(__arch ${__cuda_arch_bin}) + if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) + list(APPEND __nvcc_archs_readable sm_${__arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(__arch ${__cuda_arch_ptx}) + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) + list(APPEND __nvcc_archs_readable compute_${__arch}) + endforeach() + + string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") + set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Short command for cuda comnpilation +# Usage: +# xgboost_cuda_compile( ) +macro(xgboost_cuda_compile objlist_variable) + foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) + set(${var}_backup_in_cuda_compile_ "${${var}}") + + # we remove /EHa as it generates warnings under windows + string(REPLACE "/EHa" "" ${var} "${${var}}") + + endforeach() + if(UNIX OR APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC) + endif() + + if(APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) + endif() + + set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G -lineinfo") + + if(MSVC) + # disable noisy warnings: + # 4819: The file contains a character that cannot be represented in the current code page (number). 
+ list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819") + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) + endif() + + # If the build system is a container, make sure the nvcc intermediate files + # go into the build output area rather than in /tmp, which may run out of space + if(IS_CONTAINER_BUILD) + set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}") + message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}") + list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}") + endif() + + cuda_compile(cuda_objcs ${ARGN}) + + foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) + set(${var} "${${var}_backup_in_cuda_compile_}") + unset(${var}_backup_in_cuda_compile_) + endforeach() + + set(${objlist_variable} ${cuda_objcs}) +endmacro() + + +################################################################################################ +### Non macro section +################################################################################################ + +# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so +if(NOT CUDA_TOOLKIT_ROOT_DIR) + find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64) + if(CUDA_LIBRARY_PATH) + get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY) + set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..") + endif() +endif() + +find_package(CUDA 5.5 QUIET REQUIRED) +find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand + +if(NOT CUDA_FOUND) + return() +endif() + +set(HAVE_CUDA TRUE) +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) +list(APPEND xgboost_LINKER_LIBS ${CUDA_CUDART_LIBRARY} + ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + +# Known NVIDIA GPU achitectures xgboost can be compiled for. +# This list will be used for CUDA_ARCH_NAME = All option +if(CUDA_ARCH_ALL) + set(xgboost_known_gpu_archs "${CUDA_ARCH_ALL}") +else() + if(${CUDA_VERSION} GREATER 7.5) + set(xgboost_known_gpu_archs "30 35 50 52 60 61") + else() + set(xgboost_known_gpu_archs "30 35 50 52") + endif() +endif() + +# cudnn detection +if(USE_CUDNN) + detect_cuDNN() + if(HAVE_CUDNN) + add_definitions(-DUSE_CUDNN) + include_directories(SYSTEM ${CUDNN_INCLUDE}) + list(APPEND xgboost_LINKER_LIBS ${CUDNN_LIBRARY}) + endif() +endif() + +# setting nvcc arch flags +xgboost_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or +# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt +if(Boost_VERSION EQUAL 105500) + message(STATUS "Cuda + Boost 1.55: Applying noinline work around") + # avoid warning for CMake >= 2.8.12 + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ") +endif() + +# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. 
+foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag}) +endforeach() + +# setting default testing device +if(NOT CUDA_TEST_DEVICE) + set(CUDA_TEST_DEVICE -1) +endif() + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) + +# Handle clang/libc++ issue +if(APPLE) + xgboost_detect_darwin_version(OSX_VERSION) + + # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits + if(OSX_VERSION VERSION_GREATER 10.8) + # enabled by default if and only if CUDA version is less than 7.0 + xgboost_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0)) + endif() +endif() diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake new file mode 100644 index 000000000..8b737f8b7 --- /dev/null +++ b/cmake/Utils.cmake @@ -0,0 +1,398 @@ +################################################################################################ +# Command alias for debugging messages +# Usage: +# dmsg() +function(dmsg) + message(STATUS ${ARGN}) +endfunction() + +################################################################################################ +# Removes duplicates from list(s) +# Usage: +# xgboost_list_unique( [] [...]) +macro(xgboost_list_unique) + foreach(__lst ${ARGN}) + if(${__lst}) + list(REMOVE_DUPLICATES ${__lst}) + endif() + endforeach() +endmacro() + +################################################################################################ +# Clears variables from list +# Usage: +# xgboost_clear_vars() +macro(xgboost_clear_vars) + foreach(_var ${ARGN}) + unset(${_var}) + endforeach() +endmacro() + +################################################################################################ +# Removes duplicates from string +# Usage: +# xgboost_string_unique() +function(xgboost_string_unique __string) + if(${__string}) + set(__list ${${__string}}) + separate_arguments(__list) + list(REMOVE_DUPLICATES __list) + foreach(__e ${__list}) + set(__str "${__str} ${__e}") + endforeach() + set(${__string} ${__str} PARENT_SCOPE) + endif() +endfunction() + +################################################################################################ +# Prints list element per line +# Usage: +# xgboost_print_list() +function(xgboost_print_list) + foreach(e ${ARGN}) + message(STATUS ${e}) + endforeach() +endfunction() + +################################################################################################ +# Function merging lists of compiler flags to single string. +# Usage: +# xgboost_merge_flag_lists(out_variable [] [] ...) 
+function(xgboost_merge_flag_lists out_var) + set(__result "") + foreach(__list ${ARGN}) + foreach(__flag ${${__list}}) + string(STRIP ${__flag} __flag) + set(__result "${__result} ${__flag}") + endforeach() + endforeach() + string(STRIP ${__result} __result) + set(${out_var} ${__result} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Converts all paths in list to absolute +# Usage: +# xgboost_convert_absolute_paths() +function(xgboost_convert_absolute_paths variable) + set(__dlist "") + foreach(__s ${${variable}}) + get_filename_component(__abspath ${__s} ABSOLUTE) + list(APPEND __list ${__abspath}) + endforeach() + set(${variable} ${__list} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Reads set of version defines from the header file +# Usage: +# xgboost_parse_header( ..) +macro(xgboost_parse_header FILENAME FILE_VAR) + set(vars_regex "") + set(__parnet_scope OFF) + set(__add_cache OFF) + foreach(name ${ARGN}) + if("${name}" STREQUAL "PARENT_SCOPE") + set(__parnet_scope ON) + elseif("${name}" STREQUAL "CACHE") + set(__add_cache ON) + elseif(vars_regex) + set(vars_regex "${vars_regex}|${name}") + else() + set(vars_regex "${name}") + endif() + endforeach() + if(EXISTS "${FILENAME}") + file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" ) + else() + unset(${FILE_VAR}) + endif() + foreach(name ${ARGN}) + if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE") + if(${FILE_VAR}) + if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*") + string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}") + else() + set(${name} "") + endif() + if(__add_cache) + set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE) + elseif(__parnet_scope) + set(${name} "${${name}}" PARENT_SCOPE) + endif() + else() + unset(${name} CACHE) + endif() + endif() + endforeach() +endmacro() + +################################################################################################ +# Reads single version define from the header file and parses it +# Usage: +# xgboost_parse_header_single_define( ) +function(xgboost_parse_header_single_define LIBNAME HDR_PATH VARNAME) + set(${LIBNAME}_H "") + if(EXISTS "${HDR_PATH}") + file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1) + endif() + + if(${LIBNAME}_H) + string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}") + string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}") + string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}") + set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) + + # append a TWEAK version if it exists: + set(${LIBNAME}_VERSION_TWEAK "") + if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") + set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) + endif() + 
if(${LIBNAME}_VERSION_TWEAK) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) + else() + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) + endif() + endif() +endfunction() + +######################################################################################################## +# An option that the user can select. Can accept condition to control when option is available for user. +# Usage: +# xgboost_option( "doc string" [IF ]) +function(xgboost_option variable description value) + set(__value ${value}) + set(__condition "") + set(__varname "__value") + foreach(arg ${ARGN}) + if(arg STREQUAL "IF" OR arg STREQUAL "if") + set(__varname "__condition") + else() + list(APPEND ${__varname} ${arg}) + endif() + endforeach() + unset(__varname) + if("${__condition}" STREQUAL "") + set(__condition 2 GREATER 1) + endif() + + if(${__condition}) + if("${__value}" MATCHES ";") + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + elseif(DEFINED ${__value}) + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + else() + option(${variable} "${description}" ${__value}) + endif() + else() + unset(${variable} CACHE) + endif() +endfunction() + +################################################################################################ +# Utility macro for comparing two lists. Used for CMake debugging purposes +# Usage: +# xgboost_compare_lists( [description]) +function(xgboost_compare_lists list1 list2 desc) + set(__list1 ${${list1}}) + set(__list2 ${${list2}}) + list(SORT __list1) + list(SORT __list2) + list(LENGTH __list1 __len1) + list(LENGTH __list2 __len2) + + if(NOT ${__len1} EQUAL ${__len2}) + message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") + endif() + + foreach(__i RANGE 1 ${__len1}) + math(EXPR __index "${__i}- 1") + list(GET __list1 ${__index} __item1) + list(GET __list2 ${__index} __item2) + if(NOT ${__item1} STREQUAL ${__item2}) + message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. 
${desc}") + endif() + endforeach() +endfunction() + +################################################################################################ +# Command for disabling warnings for different platforms (see below for gcc and VisualStudio) +# Usage: +# xgboost_warnings_disable( -Wshadow /wd4996 ..,) +macro(xgboost_warnings_disable) + set(_flag_vars "") + set(_msvc_warnings "") + set(_gxx_warnings "") + + foreach(arg ${ARGN}) + if(arg MATCHES "^CMAKE_") + list(APPEND _flag_vars ${arg}) + elseif(arg MATCHES "^/wd") + list(APPEND _msvc_warnings ${arg}) + elseif(arg MATCHES "^-W") + list(APPEND _gxx_warnings ${arg}) + endif() + endforeach() + + if(NOT _flag_vars) + set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() + + if(MSVC AND _msvc_warnings) + foreach(var ${_flag_vars}) + foreach(warning ${_msvc_warnings}) + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) + foreach(var ${_flag_vars}) + foreach(warning ${_gxx_warnings}) + if(NOT warning MATCHES "^-Wno-") + string(REPLACE "${warning}" "" ${var} "${${var}}") + string(REPLACE "-W" "-Wno-" warning "${warning}") + endif() + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + endif() + xgboost_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) +endmacro() + +################################################################################################ +# Helper function get current definitions +# Usage: +# xgboost_get_current_definitions() +function(xgboost_get_current_definitions definitions_var) + get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) + set(result "") + + foreach(d ${current_definitions}) + list(APPEND result -D${d}) + endforeach() + + xgboost_list_unique(result) + set(${definitions_var} ${result} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function get current includes/definitions +# Usage: +# xgboost_get_current_cflags() +function(xgboost_get_current_cflags cflags_var) + get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) + xgboost_convert_absolute_paths(current_includes) + xgboost_get_current_definitions(cflags) + + foreach(i ${current_includes}) + list(APPEND cflags "-I${i}") + endforeach() + + xgboost_list_unique(cflags) + set(${cflags_var} ${cflags} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function to parse current linker libs into link directories, libflags and osx frameworks +# Usage: +# xgboost_parse_linker_libs( ) +function(xgboost_parse_linker_libs xgboost_LINKER_LIBS_variable folders_var flags_var frameworks_var) + + set(__unspec "") + set(__debug "") + set(__optimized "") + set(__framework "") + set(__varname "__unspec") + + # split libs into debug, optimized, unspecified and frameworks + foreach(list_elem ${${xgboost_LINKER_LIBS_variable}}) + if(list_elem STREQUAL "debug") + set(__varname "__debug") + elseif(list_elem STREQUAL "optimized") + set(__varname "__optimized") + elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") + list(APPEND __framework -framework ${CMAKE_MATCH_1}) + else() + list(APPEND ${__varname} ${list_elem}) + set(__varname "__unspec") + endif() + endforeach() + + # attach debug or optimized libs to unspecified according to current configuration + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(__libs ${__unspec} ${__debug}) + else() + 
set(__libs ${__unspec} ${__optimized}) + endif() + + set(libflags "") + set(folders "") + + # convert linker libraries list to link flags + foreach(lib ${__libs}) + if(TARGET ${lib}) + list(APPEND folders $) + list(APPEND libflags -l${lib}) + elseif(lib MATCHES "^-l.*") + list(APPEND libflags ${lib}) + elseif(IS_ABSOLUTE ${lib}) + get_filename_component(name_we ${lib} NAME_WE) + get_filename_component(folder ${lib} PATH) + + string(REGEX MATCH "^lib(.*)" __match ${name_we}) + list(APPEND libflags -l${CMAKE_MATCH_1}) + list(APPEND folders ${folder}) + else() + message(FATAL_ERROR "Logic error. Need to update cmake script") + endif() + endforeach() + + xgboost_list_unique(libflags folders) + + set(${folders_var} ${folders} PARENT_SCOPE) + set(${flags_var} ${libflags} PARENT_SCOPE) + set(${frameworks_var} ${__framework} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... +# Usage: +# xgboost_detect_darwin_version() +function(xgboost_detect_darwin_version output_var) + if(APPLE) + execute_process(COMMAND /usr/bin/sw_vers -productVersion + RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + set(${output_var} ${__sw_vers_out} PARENT_SCOPE) + else() + set(${output_var} "" PARENT_SCOPE) + endif() +endfunction() + +################################################################################################ +# Convenient command to setup source group for IDEs that support this feature (VS, XCode) +# Usage: +# caffe_source_group( GLOB[_RECURSE] ) +function(xgboost_source_group group) + cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN}) + if(CAFFE_SOURCE_GROUP_GLOB) + file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB}) + source_group(${group} FILES ${srcs1}) + endif() + + if(CAFFE_SOURCE_GROUP_GLOB_RECURSE) + file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE}) + source_group(${group} FILES ${srcs2}) + endif() +endfunction() diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index eb2ce129f..bfd302ef1 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -24,7 +24,11 @@ // helper functions // set handle void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) { - jlong out = (jlong) handle; +#ifdef __APPLE__ + jlong out = (long) handle; +#else + int64_t out = (int64_t) handle; +#endif jenv->SetLongArrayRegion(jhandle, 0, 1, &out); } @@ -32,7 +36,7 @@ void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) { static JavaVM* global_jvm = nullptr; // overrides JNI on load -JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) { +jint JNI_OnLoad(JavaVM *vm, void *reserved) { global_jvm = vm; return JNI_VERSION_1_6; } @@ -72,7 +76,7 @@ XGB_EXTERN_C int XGBoost4jCallbackDataIterNext( batch, jenv->GetFieldID(batchClass, "featureValue", "[F")); XGBoostBatchCSR cbatch; cbatch.size = jenv->GetArrayLength(joffset) - 1; - cbatch.offset = reinterpret_cast( + cbatch.offset = reinterpret_cast( jenv->GetLongArrayElements(joffset, 0)); if (jlabel != nullptr) { cbatch.label = jenv->GetFloatArrayElements(jlabel, 0); diff --git a/nccl b/nccl new file mode 160000 index 000000000..93183bca9 --- /dev/null +++ b/nccl @@ -0,0 +1 @@ +Subproject commit 93183bca921b2e8e1754e27e1b43d73cf6caec9d diff --git a/plugin/updater_gpu/README.md 
b/plugin/updater_gpu/README.md index 920f47493..283e730bb 100644
--- a/plugin/updater_gpu/README.md
+++ b/plugin/updater_gpu/README.md
@@ -17,8 +17,11 @@
colsample_bytree | ✔ | ✔|
colsample_bylevel | ✔ | ✔ |
max_bin | ✖ | ✔ |
gpu_id | ✔ | ✔ |
+n_gpus | ✖ | ✔ |
-All algorithms currently use only a single GPU. The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
+The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
+
+Multiple GPUs can be used with the grow_gpu_hist method by setting the n_gpus parameter, which defaults to -1 (use all visible GPUs). If gpu_id is specified as non-zero, the GPU device order is (gpu_id + i) % n_visible_devices for i = 0 to n_gpus - 1; for example, with 4 visible devices, gpu_id=2 and n_gpus=-1, devices 2, 3, 0 and 1 are used. As with GPU vs. CPU, multiple GPUs will not always be faster than a single GPU, because PCI bus bandwidth can limit performance. For example, when n_features * n_bins * 2^depth divided by the time of each round/iteration becomes comparable to the real PCIe 16x bus bandwidth (on the order of 4 GB/s to 10 GB/s), AllReduce dominates the run time and multiple GPUs become ineffective at increasing performance. CPU overhead between GPU calls can also limit the usefulness of multiple GPUs.
This plugin currently works with the CLI version and python version.
@@ -54,29 +57,38 @@ $ python -m nose test/python/
## Dependencies
A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depends on shuffle and vote instructions introduced in Kepler).
-Building the plug-in requires CUDA Toolkit 7.5 or later.
+Building the plug-in requires CUDA Toolkit 7.5 or later (https://developer.nvidia.com/cuda-downloads).
+submodule: The plugin also depends on CUB 1.6.4 (https://nvlabs.github.io/cub/), a header-only CUDA library that provides sort/reduce/scan primitives.
+
+submodule: NVIDIA NCCL from https://github.com/NVIDIA/nccl, with a Windows port available via git@github.com:h2oai/nccl.git
## Build
-### Using cmake
-To use the plugin xgboost must be built by specifying the option PLUGIN_UPDATER_GPU=ON. CMake will prepare a build system depending on which platform you are on.
+From the command line on Linux, starting from the xgboost directory:
```bash
$ mkdir build
$ cd build
$ cmake .. -DPLUGIN_UPDATER_GPU=ON
-$ make
+$ make -j
```
-If 'make' fails try invoking make again. There can sometimes be problems with the order items are built.
-
-On Windows you may also need to specify your generator as 64 bit, so the cmake command becomes:
+On Windows, check which generators are available to cmake and choose one with [arch] replaced by Win64:
```bash
-$ cmake .. -G"Visual Studio 12 2013 Win64" -DPLUGIN_UPDATER_GPU=ON
+cmake -help
```
-You may also be able to use a later version of visual studio depending on whether the CUDA toolkit supports it.
-cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode. This is also a good time to check it is being built as x64. If not make sure the cmake generator is set correctly.
+Then run cmake as:
+```bash
+$ mkdir build
+$ cd build
+$ cmake .. -G"Visual Studio 14 2015 Win64" -DPLUGIN_UPDATER_GPU=ON
+```
+CMake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode as an x64 build.
+
+Visual Studio Community 2015, which is supported by the CUDA Toolkit (http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/#axzz4isREr2nS), can be downloaded from https://my.visualstudio.com/Downloads?q=Visual%20Studio%20Community%202015 . You may also be able to use a later version of Visual Studio, depending on whether the CUDA Toolkit supports it. Note that MinGW cannot be used with CUDA.
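Once the library is built, the plugin can be exercised from the Python package mentioned above. The following is a minimal, hypothetical sketch, not part of this change: it assumes the plugin is selected through the 'updater' training parameter as 'grow_gpu_hist', uses the gpu_id/n_gpus/max_bin parameters from the table above, and generates placeholder data.

```python
# Hypothetical usage sketch for the multi-GPU histogram updater.
# Assumes the freshly built xgboost Python package is importable and that
# the plugin is enabled via the 'updater' training parameter.
import numpy as np
import xgboost as xgb

X = np.random.rand(10000, 20)            # placeholder data
y = (X[:, 0] > 0.5).astype(np.float32)   # placeholder labels
dtrain = xgb.DMatrix(X, label=y)

params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'max_bin': 256,              # histogram bins (histogram method only)
    'updater': 'grow_gpu_hist',  # GPU histogram tree growth
    'gpu_id': 0,                 # first device in the rotation
    'n_gpus': -1,                # -1 = use all visible GPUs
}
bst = xgb.train(params, dtrain, num_boost_round=50)
```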
+
+### For Developers!
### Using make
Now, it also supports the usual 'make' flow to build gpu-enabled tree construction plugins. It's currently only tested on Linux. From the xgboost directory
@@ -84,9 +96,6 @@ Now, it also supports the usual 'make' flow to build gpu-enabled tree constructi
# make sure CUDA SDK bin directory is in the 'PATH' env variable
$ make PLUGIN_UPDATER_GPU=ON
```
-
-### For Developers!
-
Now, some of the code-base inside gpu plugins have googletest unit-tests inside 'tests/'. They can be enabled run along with other unit-tests inside '/tests/cpp' using:
```bash
@@ -98,10 +107,17 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
```
## Changelog
+##### 2017/6/5
+
+* Multi-GPU support for the histogram method using NVIDIA NCCL.
+
##### 2017/5/31
* Faster version of the grow_gpu plugin
* Added support for building gpu plugin through 'make' flow too
+##### 2017/5/19
+* Further performance enhancements for the histogram method.
+
##### 2017/5/5
* Histogram performance improvements
* Fix gcc build issues
@@ -115,10 +131,19 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
[Mitchell, Rory, and Eibe Frank. Accelerating the XGBoost algorithm using GPU computing. No. e2911v1. PeerJ Preprints, 2017.](https://peerj.com/preprints/2911/)
## Author
-Rory Mitchell
-
-Please report bugs to the xgboost/issues page. You can tag me with @RAMitchell.
-
-Otherwise I can be contacted at r.a.mitchell.nz at gmail.
+Rory Mitchell,
+Jonathan C. McKinney,
+Shankara Rao Thejaswi Nanditale,
+Vinay Deshpande,
+and the rest of the H2O.ai and NVIDIA team.
+
+Please report bugs to the xgboost/issues page.
diff --git a/plugin/updater_gpu/src/common.cuh b/plugin/updater_gpu/src/common.cuh
index 07b858703..36086ed66 100644
--- a/plugin/updater_gpu/src/common.cuh
+++ b/plugin/updater_gpu/src/common.cuh
@@ -1,5 +1,5 @@
/*!
- * Copyright 2016 Rory mitchell + * Copyright 2017 XGBoost contributors */ #pragma once #include @@ -147,7 +147,8 @@ inline void dense2sparse_tree(RegTree* p_tree, } // Set gradient pair to 0 with p = 1 - subsample -inline void subsample_gpair(dh::dvec* p_gpair, float subsample) { +inline void subsample_gpair(dh::dvec* p_gpair, float subsample, + int offset) { if (subsample == 1.0) { return; } @@ -157,13 +158,19 @@ inline void subsample_gpair(dh::dvec* p_gpair, float subsample) { auto d_gpair = gpair.data(); dh::BernoulliRng rng(subsample, common::GlobalRandom()()); - dh::launch_n(gpair.size(), [=] __device__(int i) { - if (!rng(i)) { + dh::launch_n(gpair.device_idx(), gpair.size(), [=] __device__(int i) { + if (!rng(i + offset)) { d_gpair[i] = gpu_gpair(); } }); } +// Set gradient pair to 0 with p = 1 - subsample +inline void subsample_gpair(dh::dvec* p_gpair, float subsample) { + int offset = 0; + subsample_gpair(p_gpair, subsample, offset); +} + inline std::vector col_sample(std::vector features, float colsample) { int n = colsample * features.size(); CHECK_GT(n, 0); @@ -233,8 +240,8 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec &in, dh::dvec &out, * @param def default value to be filled */ template -void fillConst(T* out, int len, T def) { - dh::launch_n(len, [=] __device__(int i) { out[i] = def; }); +void fillConst(int device_idx, T* out, int len, T def) { + dh::launch_n(device_idx, len, [=] __device__(int i) { out[i] = def; }); } /** @@ -247,10 +254,10 @@ void fillConst(T* out, int len, T def) { * @param nVals length of the buffers */ template -void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId, +void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId, int nVals) { dh::launch_n - (nVals, [=] __device__(int i) { + (device_idx, nVals, [=] __device__(int i) { int iid = instId[i]; T1 v1 = in1[iid]; T2 v2 = in2[iid]; @@ -267,9 +274,9 @@ void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId, * @param nVals length of the buffers */ template -void gather(T* out, const T* in, const int* instId, int nVals) { +void gather(int device_idx, T* out, const T* in, const int* instId, int nVals) { dh::launch_n - (nVals, [=] __device__(int i) { + (device_idx, nVals, [=] __device__(int i) { int iid = instId[i]; out[i] = in[iid]; }); diff --git a/plugin/updater_gpu/src/device_helpers.cuh b/plugin/updater_gpu/src/device_helpers.cuh index ae14d7cc6..5a90b6170 100644 --- a/plugin/updater_gpu/src/device_helpers.cuh +++ b/plugin/updater_gpu/src/device_helpers.cuh @@ -1,5 +1,5 @@ /*! 
- * Copyright 2016 Rory mitchell + * Copyright 2017 XGBoost contributors */ #pragma once #include @@ -12,11 +12,20 @@ #include #include #include +#include #include +#ifndef NCCL +#define NCCL 1 +#endif + +#if (NCCL) +#include "nccl.h" +#endif + // Uncomment to enable // #define DEVICE_TIMER -// #define TIMERS +#define TIMERS namespace dh { @@ -42,6 +51,22 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file, return code; } +#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__) + +#if (NCCL) +inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file, + int line) { + if (code != ncclSuccess) { + std::stringstream ss; + ss << "NCCL failure :" << ncclGetErrorString(code) << " "; + ss << file << "(" << line << ")"; + throw std::runtime_error(ss.str()); + } + + return code; +} +#endif + #define gpuErrchk(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, @@ -53,6 +78,55 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, } } +inline int n_visible_devices() { + int n_visgpus = 0; + + cudaGetDeviceCount(&n_visgpus); + + return n_visgpus; +} + +inline int n_devices_all(int n_gpus) { + if (NCCL == 0 && n_gpus > 1 || NCCL == 0 && n_gpus != 0) { + if (n_gpus != 1 && n_gpus != 0) { + fprintf(stderr, "NCCL=0, so forcing n_gpus=1\n"); + fflush(stderr); + } + n_gpus = 1; + } + int n_devices_visible = dh::n_visible_devices(); + int n_devices = n_gpus < 0 ? n_devices_visible : n_gpus; + return (n_devices); +} +inline int n_devices(int n_gpus, int num_rows) { + int n_devices = dh::n_devices_all(n_gpus); + // fix-up device number to be limited by number of rows + n_devices = n_devices > num_rows ? num_rows : n_devices; + return (n_devices); +} + +// if n_devices=-1, then use all visible devices +inline void synchronize_n_devices(int n_devices, std::vector dList) { + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + safe_cuda(cudaSetDevice(device_idx)); + safe_cuda(cudaDeviceSynchronize()); + } +} +inline void synchronize_all() { + for (int device_idx = 0; device_idx < n_visible_devices(); device_idx++) { + safe_cuda(cudaSetDevice(device_idx)); + safe_cuda(cudaDeviceSynchronize()); + } +} + +inline std::string device_name(int device_idx) { + cudaDeviceProp prop; + dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx)); + return std::string(prop.name); +} + + /* * Timers */ @@ -119,7 +193,9 @@ struct DeviceTimer { #ifdef DEVICE_TIMER __device__ DeviceTimer(DeviceTimerGlobal >imer, int slot) // NOLINT - : GTimer(GTimer), start(clock()), slot(slot) {} + : GTimer(GTimer), + start(clock()), + slot(slot) {} #else __device__ DeviceTimer(DeviceTimerGlobal >imer, int slot) {} // NOLINT #endif @@ -146,8 +222,8 @@ struct Timer { void reset() { start = ClockT::now(); } int64_t elapsed() const { return (ClockT::now() - start).count(); } void printElapsed(std::string label) { - safe_cuda(cudaDeviceSynchronize()); - printf("%s:\t %lld\n", label.c_str(), (long long)elapsed()); + // synchronize_n_devices(n_devices, dList); + printf("%s:\t %lld\n", label.c_str(), elapsed()); reset(); } }; @@ -229,43 +305,47 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) { * Memory */ +enum memory_type { DEVICE, DEVICE_MANAGED }; + +template class bulk_allocator; template class dvec2; template class dvec { - friend bulk_allocator; - friend class dvec2; - + friend class dvec2; private: T *_ptr; size_t _size; + int _device_idx; - void 
external_allocate(void *ptr, size_t size) { + public: + void external_allocate(int device_idx, void *ptr, size_t size) { if (!empty()) { throw std::runtime_error("Tried to allocate dvec but already allocated"); } _ptr = static_cast(ptr); _size = size; + _device_idx = device_idx; } - public: - dvec() : _ptr(NULL), _size(0) {} - + dvec() : _ptr(NULL), _size(0), _device_idx(0) {} size_t size() const { return _size; } - + int device_idx() const { return _device_idx; } bool empty() const { return _ptr == NULL || _size == 0; } T *data() { return _ptr; } std::vector as_vector() const { std::vector h_vector(size()); + safe_cuda(cudaSetDevice(_device_idx)); safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T), cudaMemcpyDeviceToHost)); return h_vector; } void fill(T value) { + safe_cuda(cudaSetDevice(_device_idx)); thrust::fill_n(thrust::device_pointer_cast(_ptr), size(), value); } @@ -285,11 +365,7 @@ class dvec { template dvec &operator=(const std::vector &other) { - if (other.size() != size()) { - throw std::runtime_error( - "Cannot copy assign vector to dvec, sizes are different"); - } - thrust::copy(other.begin(), other.end(), this->tbegin()); + this->copy(other.begin(), other.end()); return *this; } @@ -298,9 +374,25 @@ class dvec { throw std::runtime_error( "Cannot copy assign dvec to dvec, sizes are different"); } - thrust::copy(other.tbegin(), other.tend(), this->tbegin()); + safe_cuda(cudaSetDevice(this->device_idx())); + if (other.device_idx() == this->device_idx()) { + thrust::copy(other.tbegin(), other.tend(), this->tbegin()); + } else { + throw std::runtime_error("Cannot copy to/from different devices"); + } + return *this; } + + template + void copy(IterT begin, IterT end) { + safe_cuda(cudaSetDevice(this->device_idx())); + if (end - begin != size()) { + throw std::runtime_error( + "Cannot copy assign vector to dvec, sizes are different"); + } + thrust::copy(begin, end, this->tbegin()); + } }; /** @@ -309,34 +401,34 @@ class dvec { */ template class dvec2 { - friend bulk_allocator; private: dvec _d1, _d2; cub::DoubleBuffer _buff; + int _device_idx; - void external_allocate(void *ptr1, void *ptr2, size_t size) { + + public: + void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) { if (!empty()) { throw std::runtime_error("Tried to allocate dvec2 but already allocated"); } - _d1.external_allocate(ptr1, size); - _d2.external_allocate(ptr2, size); + _d1.external_allocate(_device_idx, ptr1, size); + _d2.external_allocate(_device_idx, ptr2, size); _buff.d_buffers[0] = static_cast(ptr1); _buff.d_buffers[1] = static_cast(ptr2); _buff.selector = 0; + _device_idx = device_idx; } - - public: - dvec2() : _d1(), _d2(), _buff() {} + dvec2() : _d1(), _d2(), _buff(), _device_idx(0) {} size_t size() const { return _d1.size(); } - + int device_idx() const { return _device_idx; } bool empty() const { return _d1.empty() || _d2.empty(); } cub::DoubleBuffer &buff() { return _buff; } dvec &d1() { return _d1; } - dvec &d2() { return _d2; } T *current() { return _buff.Current(); } @@ -346,9 +438,11 @@ class dvec2 { T *other() { return _buff.Alternate(); } }; +template class bulk_allocator { - char *d_ptr; - size_t _size; + std::vector d_ptr; + std::vector _size; + std::vector _device_idx; const int align = 256; @@ -369,18 +463,32 @@ class bulk_allocator { } template - void allocate_dvec(char *ptr, dvec *first_vec, SizeT first_size) { - first_vec->external_allocate(static_cast(ptr), first_size); + void allocate_dvec(int device_idx, char *ptr, dvec *first_vec, + SizeT 
first_size) { + first_vec->external_allocate(device_idx, static_cast(ptr), + first_size); } template - void allocate_dvec(char *ptr, dvec *first_vec, SizeT first_size, - Args... args) { - allocate_dvec(ptr, first_vec, first_size); + void allocate_dvec(int device_idx, char *ptr, dvec *first_vec, + SizeT first_size, Args... args) { + first_vec->external_allocate(device_idx, static_cast(ptr), + first_size); ptr += align_round_up(first_size * sizeof(T)); - allocate_dvec(ptr, args...); + allocate_dvec(device_idx, ptr, args...); } + // template + char *allocate_device(int device_idx, size_t bytes, memory_type t) { + char *ptr; + if (t == memory_type::DEVICE) { + safe_cuda(cudaSetDevice(device_idx)); + safe_cuda(cudaMalloc(&ptr, bytes)); + } else { + safe_cuda(cudaMallocManaged(&ptr, bytes)); + } + return ptr; + } template size_t get_size_bytes(dvec2 *first_vec, SizeT first_size) { return 2 * align_round_up(first_size * sizeof(T)); @@ -392,40 +500,46 @@ class bulk_allocator { } template - void allocate_dvec(char *ptr, dvec2 *first_vec, SizeT first_size) { - first_vec->external_allocate - (static_cast(ptr), + void allocate_dvec(int device_idx, char *ptr, dvec2 *first_vec, SizeT first_size) { + first_vec->external_allocate(device_idx, static_cast(ptr), static_cast(ptr+align_round_up(first_size * sizeof(T))), first_size); } template - void allocate_dvec(char *ptr, dvec2 *first_vec, SizeT first_size, + void allocate_dvec(int device_idx, char *ptr, dvec2 *first_vec, SizeT first_size, Args... args) { - allocate_dvec(ptr, first_vec, first_size); + allocate_dvec(device_idx, ptr, first_vec, first_size); ptr += (align_round_up(first_size * sizeof(T)) * 2); - allocate_dvec(ptr, args...); + allocate_dvec(device_idx, ptr, args...); } public: - bulk_allocator() : _size(0), d_ptr(NULL) {} - ~bulk_allocator() { - if (!(d_ptr == nullptr)) { - safe_cuda(cudaFree(d_ptr)); + for (int i = 0; i < d_ptr.size(); i++) { + if (!(d_ptr[i] == nullptr)) { + safe_cuda(cudaSetDevice(_device_idx[i])); + safe_cuda(cudaFree(d_ptr[i])); + } } } - size_t size() { return _size; } + // returns sum of bytes for all allocations + size_t size() { + return std::accumulate(_size.begin(), _size.end(), static_cast(0)); + } template - void allocate(Args... args) { - if (d_ptr != NULL) { - throw std::runtime_error("Bulk allocator already allocated"); - } - _size = get_size_bytes(args...); - safe_cuda(cudaMalloc(&d_ptr, _size)); - allocate_dvec(d_ptr, args...); + void allocate(int device_idx, Args... 
args) { + size_t size = get_size_bytes(args...); + + char *ptr = allocate_device(device_idx, size, MemoryT); + + allocate_dvec(device_idx, ptr, args...); + + d_ptr.push_back(ptr); + _size.push_back(size); + _device_idx.push_back(device_idx); } }; @@ -455,19 +569,14 @@ struct CubMemory { bool IsAllocated() { return d_temp_storage != NULL; } }; -inline size_t available_memory() { +inline size_t available_memory(int device_idx) { size_t device_free = 0; size_t device_total = 0; - safe_cuda(cudaMemGetInfo(&device_free, &device_total)); + safe_cuda(cudaSetDevice(device_idx)); + dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total)); return device_free; } -inline std::string device_name() { - cudaDeviceProp prop; - safe_cuda(cudaGetDeviceProperties(&prop, 0)); - return std::string(prop.name); -} - /* * Utility functions */ @@ -481,7 +590,7 @@ void print(const thrust::device_vector &v, size_t max_items = 10) { std::cout << "\n"; } -template +template void print(const dvec &v, size_t max_items = 10) { std::vector h = v.as_vector(); for (int i = 0; i < std::min(max_items, h.size()); i++) { @@ -530,17 +639,46 @@ size_t size_bytes(const thrust::device_vector &v) { */ template -__global__ void launch_n_kernel(size_t n, L lambda) { - for (auto i : grid_stride_range(static_cast(0), n)) { +__global__ void launch_n_kernel(size_t begin, size_t end, L lambda) { + for (auto i : grid_stride_range(begin, end)) { lambda(i); } } +template +__global__ void launch_n_kernel(int device_idx, size_t begin, size_t end, + L lambda) { + for (auto i : grid_stride_range(begin, end)) { + lambda(i, device_idx); + } +} template -inline void launch_n(size_t n, L lambda) { +inline void launch_n(int device_idx, size_t n, L lambda) { + safe_cuda(cudaSetDevice(device_idx)); const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS); #if defined(__CUDACC__) - launch_n_kernel<<>>(n, lambda); + launch_n_kernel<<>>(static_cast(0), n, + lambda); +#endif +} + +// if n_devices=-1, then use all visible devices +template +inline void multi_launch_n(size_t n, int n_devices, L lambda) { + n_devices = n_devices < 0 ? n_visible_devices() : n_devices; + CHECK_LE(n_devices, n_visible_devices()) << "Number of devices requested " + "needs to be less than equal to " + "number of visible devices."; + const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS); +#if defined(__CUDACC__) + n_devices = n_devices > n ? 
n : n_devices; + for (int device_idx = 0; device_idx < n_devices; device_idx++) { + safe_cuda(cudaSetDevice(device_idx)); + size_t begin = (n / n_devices) * device_idx; + size_t end = std::min((n / n_devices) * (device_idx + 1), n); + launch_n_kernel<<>>(device_idx, begin, end, + lambda); + } #endif } diff --git a/plugin/updater_gpu/src/exact/argmax_by_key.cuh b/plugin/updater_gpu/src/exact/argmax_by_key.cuh index 181157939..d4f2a4333 100644 --- a/plugin/updater_gpu/src/exact/argmax_by_key.cuh +++ b/plugin/updater_gpu/src/exact/argmax_by_key.cuh @@ -168,7 +168,7 @@ void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans, const node_id_t* nodeAssigns, const Node* nodes, int nUniqKeys, node_id_t nodeStart, int len, const TrainParam param, ArgMaxByKeyAlgo algo) { - fillConst(nodeSplits, nUniqKeys, Split()); + fillConst(param.gpu_id, nodeSplits, nUniqKeys, Split()); int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM); switch(algo) { case ABK_GMEM: diff --git a/plugin/updater_gpu/src/exact/gpu_builder.cuh b/plugin/updater_gpu/src/exact/gpu_builder.cuh index 750eb3f59..6cf38b4b2 100644 --- a/plugin/updater_gpu/src/exact/gpu_builder.cuh +++ b/plugin/updater_gpu/src/exact/gpu_builder.cuh @@ -208,7 +208,7 @@ private: dh::dvec tmpScanGradBuff; dh::dvec tmpScanKeyBuff; dh::dvec colIds; - dh::bulk_allocator ba; + dh::bulk_allocator ba; void findSplit(int level, node_id_t nodeStart, int nNodes) { reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(), @@ -226,7 +226,8 @@ private: void allocateAllData(int offsetSize) { int tmpBuffSize = scanTempBufferSize(nVals); - ba.allocate(&vals, nVals, + ba.allocate(param.gpu_id, + &vals, nVals, &vals_cached, nVals, &instIds, nVals, &instIds_cached, nVals, @@ -245,7 +246,7 @@ private: } void setupOneTimeData(DMatrix& hMat) { - size_t free_memory = dh::available_memory(); + size_t free_memory = dh::available_memory(param.gpu_id); if (!hMat.SingleColBlock()) { throw std::runtime_error("exact::GPUBuilder - must have 1 column block"); } @@ -258,7 +259,7 @@ private: if (!param.silent) { const int mb_size = 1048576; LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/" - << free_memory / mb_size << " MB on " << dh::device_name(); + << free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id); } } @@ -340,7 +341,7 @@ private: colOffsets.data(), vals.current(), nVals, nCols); // gather the node assignments across all other columns too - gather(nodeAssigns.current(), nodeAssignsPerInst.data(), + gather(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(), instIds.current(), nVals); sortKeys(level); } @@ -351,7 +352,7 @@ private: // but we don't need more than level+1 bits for sorting! 
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets, 0, level+1); - gather(vals.other(), vals.current(), instIds.other(), + gather(param.gpu_id, vals.other(), vals.current(), instIds.other(), instIds.current(), nodeLocations.current(), nVals); vals.buff().selector ^= 1; instIds.buff().selector ^= 1; diff --git a/plugin/updater_gpu/src/functions.cuh b/plugin/updater_gpu/src/functions.cuh index c9f2ff863..34426d7d5 100644 --- a/plugin/updater_gpu/src/functions.cuh +++ b/plugin/updater_gpu/src/functions.cuh @@ -2,14 +2,10 @@ * Copyright 2016 Rory mitchell */ #pragma once -#include "types.cuh" -#include "../../../src/tree/param.h" #include "../../../src/common/random.h" - +#include "../../../src/tree/param.h" +#include "types.cuh" namespace xgboost { -namespace tree { - - -} // namespace tree +namespace tree {} // namespace tree } // namespace xgboost diff --git a/plugin/updater_gpu/src/gpu_data.cuh b/plugin/updater_gpu/src/gpu_data.cuh index 2cf74612d..82956215c 100644 --- a/plugin/updater_gpu/src/gpu_data.cuh +++ b/plugin/updater_gpu/src/gpu_data.cuh @@ -21,7 +21,8 @@ struct GPUData { int n_features; int n_instances; - dh::bulk_allocator ba; + dh::bulk_allocator ba; + // dh::bulk_allocator ba; GPUTrainingParam param; dh::dvec fvalues; @@ -72,24 +73,25 @@ struct GPUData { n_features, foffsets.data(), foffsets.data() + 1); // Allocate memory - size_t free_memory = dh::available_memory(); - ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(), - &fvalues_cached, in_fvalues.size(), &foffsets, - in_foffsets.size(), &instance_id, in_instance_id.size(), - &instance_id_temp, in_instance_id.size(), &instance_id_cached, - in_instance_id.size(), &feature_id, in_feature_id.size(), - &node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(), - &node_id_instance, n_instances, &gpair, n_instances, &nodes, - max_nodes, &split_candidates, max_nodes_level * n_features, - &node_sums, max_nodes_level * n_features, &node_offsets, - max_nodes_level * n_features, &sort_index_in, in_fvalues.size(), - &sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size, - &feature_flags, n_features, &feature_set, n_features); + size_t free_memory = dh::available_memory(param_in.gpu_id); + ba.allocate(param_in.gpu_id, + &fvalues, in_fvalues.size(), &fvalues_temp, + in_fvalues.size(), &fvalues_cached, in_fvalues.size(), &foffsets, + in_foffsets.size(), &instance_id, in_instance_id.size(), + &instance_id_temp, in_instance_id.size(), &instance_id_cached, + in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id, + in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance, + n_instances, &gpair, n_instances, &nodes, max_nodes, &split_candidates, + max_nodes_level * n_features, &node_sums, max_nodes_level * n_features, + &node_offsets, max_nodes_level * n_features, &sort_index_in, + in_fvalues.size(), &sort_index_out, in_fvalues.size(), &cub_mem, + cub_mem_size, &feature_flags, n_features, &feature_set, n_features); if (!param_in.silent) { const int mb_size = 1048576; LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/" - << free_memory / mb_size << " MB on " << dh::device_name(); + << free_memory / mb_size << " MB on " + << dh::device_name(param_in.gpu_id); } fvalues_cached = in_fvalues; @@ -134,9 +136,10 @@ struct GPUData { auto d_node_id_instance = node_id_instance.data(); auto d_instance_id = instance_id.data(); - dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) { - d_node_id[i] = d_node_id_instance[d_instance_id[i]]; - }); + 
dh::launch_n(node_id.device_idx(), fvalues.size(), + [=] __device__(bst_uint i) { + d_node_id[i] = d_node_id_instance[d_instance_id[i]]; + }); } }; } // namespace tree diff --git a/plugin/updater_gpu/src/gpu_hist_builder.cu b/plugin/updater_gpu/src/gpu_hist_builder.cu index e70247a65..4370f433d 100644 --- a/plugin/updater_gpu/src/gpu_hist_builder.cu +++ b/plugin/updater_gpu/src/gpu_hist_builder.cu @@ -1,13 +1,14 @@ /*! * Copyright 2017 Rory mitchell */ +#include #include #include #include #include -#include #include #include +#include #include #include "common.cuh" #include "device_helpers.cuh" @@ -16,16 +17,16 @@ namespace xgboost { namespace tree { -void DeviceGMat::Init(const common::GHistIndexMatrix& gmat) { - CHECK_EQ(gidx.size(), gmat.index.size()) - << "gidx must be externally allocated"; - CHECK_EQ(ridx.size(), gmat.index.size()) - << "ridx must be externally allocated"; +void DeviceGMat::Init(int device_idx, const common::GHistIndexMatrix& gmat, + bst_uint begin, bst_uint end) { + dh::safe_cuda(cudaSetDevice(device_idx)); + CHECK_EQ(gidx.size(), end - begin) << "gidx must be externally allocated"; + CHECK_EQ(ridx.size(), end - begin) << "ridx must be externally allocated"; - gidx = gmat.index; + thrust::copy(&gmat.index[begin], &gmat.index[end], gidx.tbegin()); thrust::device_vector row_ptr = gmat.row_ptr; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(begin); thrust::upper_bound(row_ptr.begin(), row_ptr.end(), counting, counting + gidx.size(), ridx.tbegin()); thrust::transform(ridx.tbegin(), ridx.tend(), ridx.tbegin(), @@ -34,19 +35,22 @@ void DeviceGMat::Init(const common::GHistIndexMatrix& gmat) { void DeviceHist::Init(int n_bins_in) { this->n_bins = n_bins_in; - CHECK(!hist.empty()) << "DeviceHist must be externally allocated"; + CHECK(!data.empty()) << "DeviceHist must be externally allocated"; } -void DeviceHist::Reset() { hist.fill(gpu_gpair()); } +void DeviceHist::Reset(int device_idx) { + cudaSetDevice(device_idx); + data.fill(gpu_gpair()); +} gpu_gpair* DeviceHist::GetLevelPtr(int depth) { - return hist.data() + n_nodes(depth - 1) * n_bins; + return data.data() + n_nodes(depth - 1) * n_bins; } int DeviceHist::LevelSize(int depth) { return n_bins * n_nodes_level(depth); } HistBuilder DeviceHist::GetBuilder() { - return HistBuilder(hist.data(), n_bins); + return HistBuilder(data.data(), n_bins); } HistBuilder::HistBuilder(gpu_gpair* ptr, int n_bins) @@ -54,7 +58,11 @@ HistBuilder::HistBuilder(gpu_gpair* ptr, int n_bins) __device__ void HistBuilder::Add(gpu_gpair gpair, int gidx, int nidx) const { int hist_idx = nidx * n_bins + gidx; - atomicAdd(&(d_hist[hist_idx]._grad), gpair._grad); + atomicAdd(&(d_hist[hist_idx]._grad), gpair._grad); // OPTMARK: This and below + // line lead to about 3X + // slowdown due to memory + // dependency and access + // pattern issues. 
atomicAdd(&(d_hist[hist_idx]._hess), gpair._hess); } @@ -68,7 +76,24 @@ GPUHistBuilder::GPUHistBuilder() p_last_fmat_(nullptr), prediction_cache_initialised(false) {} -GPUHistBuilder::~GPUHistBuilder() {} +GPUHistBuilder::~GPUHistBuilder() { +#if (NCCL) + if (initialised) { + for (int d_idx = 0; d_idx < n_devices; ++d_idx) { + ncclCommDestroy(comms[d_idx]); + + dh::safe_cuda(cudaSetDevice(dList[d_idx])); + dh::safe_cuda(cudaStreamDestroy(*(streams[d_idx]))); + } + for (int num_d = 1; num_d <= n_devices; + ++num_d) { // loop over number of devices used + for (int d_idx = 0; d_idx < n_devices; ++d_idx) { + ncclCommDestroy(find_split_comms[num_d - 1][d_idx]); + } + } + } +#endif +} void GPUHistBuilder::Init(const TrainParam& param) { CHECK(param.max_depth < 16) << "Tree depth too large."; @@ -76,181 +101,345 @@ void GPUHistBuilder::Init(const TrainParam& param) { << "Loss guided growth policy not supported. Use CPU algorithm."; this->param = param; - dh::safe_cuda(cudaSetDevice(param.gpu_id)); - if (!param.silent) { - LOG(CONSOLE) << "Device: [" << param.gpu_id << "] " << dh::device_name(); + CHECK(param.n_gpus != 0) << "Must have at least one device"; + int n_devices_all = dh::n_devices_all(param.n_gpus); + for (int device_idx = 0; device_idx < n_devices_all; device_idx++) { + if (!param.silent) { + size_t free_memory = dh::available_memory(device_idx); + const int mb_size = 1048576; + LOG(CONSOLE) << "Device: [" << device_idx << "] " + << dh::device_name(device_idx) << " with " + << free_memory / mb_size << " MB available device memory."; + } } } +void GPUHistBuilder::InitData(const std::vector& gpair, + DMatrix& fmat, // NOLINT + const RegTree& tree) { + // set member num_rows and n_devices for rest of GPUHistBuilder members + info = &fmat.info(); + num_rows = info->num_row; + n_devices = dh::n_devices(param.n_gpus, num_rows); + + if (!initialised) { + // set dList member + dList.resize(n_devices); + for (int d_idx = 0; d_idx < n_devices; ++d_idx) { + int device_idx = (param.gpu_id + d_idx) % n_devices; + dList[d_idx] = device_idx; + } + +#if (NCCL) + // initialize nccl + + comms.resize(n_devices); + streams.resize(n_devices); + dh::safe_nccl(ncclCommInitAll(comms.data(), n_devices, + dList.data())); // initialize communicator + // (One communicator per + // process) + + // printf("# NCCL: Using devices\n"); + for (int d_idx = 0; d_idx < n_devices; ++d_idx) { + streams[d_idx] = reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t))); + dh::safe_cuda(cudaSetDevice(dList[d_idx])); + dh::safe_cuda(cudaStreamCreate(streams[d_idx])); + + int cudaDev; + int rank; + cudaDeviceProp prop; + dh::safe_nccl(ncclCommCuDevice(comms[d_idx], &cudaDev)); + dh::safe_nccl(ncclCommUserRank(comms[d_idx], &rank)); + dh::safe_cuda(cudaGetDeviceProperties(&prop, cudaDev)); + // printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, + // prop.pciBusID, prop.name); + fflush(stdout); + } + + // local find_split group of comms for each case of reduced number of GPUs + // to use + find_split_comms.resize( + n_devices, std::vector<ncclComm_t>(n_devices)); // TODO(JCM): Excessive, but + // ok, and best to do + // here instead of + // repeatedly + for (int num_d = 1; num_d <= n_devices; + ++num_d) { // loop over number of devices used + dh::safe_nccl(ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d, + dList.data())); // initialize communicator + // (One communicator per + // process) + } + +#endif + + CHECK(fmat.SingleColBlock()) << "grow_gpu_hist: must have single column " + "block. 
Try setting 'tree_method' " + "parameter to 'exact'"; + is_dense = info->num_nonzero == info->num_col * info->num_row; + hmat_.Init(&fmat, param.max_bin); + gmat_.cut = &hmat_; + gmat_.Init(&fmat); + int n_bins = hmat_.row_ptr.back(); + int n_features = hmat_.row_ptr.size() - 1; + + // deliniate data onto multiple gpus + device_row_segments.push_back(0); + device_element_segments.push_back(0); + bst_uint offset = 0; + size_t shard_size = std::ceil(static_cast(num_rows) / n_devices); + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + offset += shard_size; + offset = std::min(offset, num_rows); + device_row_segments.push_back(offset); + device_element_segments.push_back(gmat_.row_ptr[offset]); + } + + // Build feature segments + std::vector h_feature_segments; + for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) { + for (int fidx = 0; fidx < n_features; fidx++) { + h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins); + } + } + h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins); + + // Construct feature map + std::vector h_gidx_feature_map(n_bins); + for (int fidx = 0; fidx < n_features; fidx++) { + for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) { + h_gidx_feature_map[i] = fidx; + } + } + + int level_max_bins = n_nodes_level(param.max_depth - 1) * n_bins; + + // allocate unique common data that reside on master device (NOTE: None + // currently) + // int master_device=dList[0]; + // ba.allocate(master_device, ); + + // allocate vectors across all devices + hist_vec.resize(n_devices); + nodes.resize(n_devices); + nodes_temp.resize(n_devices); + nodes_child_temp.resize(n_devices); + left_child_smallest.resize(n_devices); + left_child_smallest_temp.resize(n_devices); + feature_flags.resize(n_devices); + fidx_min_map.resize(n_devices); + feature_segments.resize(n_devices); + prediction_cache.resize(n_devices); + position.resize(n_devices); + position_tmp.resize(n_devices); + device_matrix.resize(n_devices); + device_gpair.resize(n_devices); + gidx_feature_map.resize(n_devices); + gidx_fvalue_map.resize(n_devices); + + int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices))); + find_split_n_devices = + std::min(n_nodes_level(param.max_depth), find_split_n_devices); + int max_num_nodes_device = + n_nodes_level(param.max_depth) / find_split_n_devices; + + // num_rows_segment: for sharding rows onto gpus for splitting data + // num_elements_segment: for sharding rows (of elements) onto gpus for + // splitting data + // max_num_nodes_device: for sharding nodes onto gpus for split finding + // All other variables have full copy on gpu, with copy either being + // identical or just current portion (like for histogram) before AllReduce + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + bst_uint num_rows_segment = + device_row_segments[d_idx + 1] - device_row_segments[d_idx]; + bst_uint num_elements_segment = + device_element_segments[d_idx + 1] - device_element_segments[d_idx]; + ba.allocate( + device_idx, &(hist_vec[d_idx].data), + n_nodes(param.max_depth - 1) * n_bins, &nodes[d_idx], + n_nodes(param.max_depth), &nodes_temp[d_idx], max_num_nodes_device, + &nodes_child_temp[d_idx], max_num_nodes_device, + &left_child_smallest[d_idx], n_nodes(param.max_depth), + &left_child_smallest_temp[d_idx], max_num_nodes_device, + &feature_flags[d_idx], + n_features, // may change but same on all devices + &fidx_min_map[d_idx], + hmat_.min_val.size(), // 
constant and same on all devices + &feature_segments[d_idx], + h_feature_segments.size(), // constant and same on all devices + &prediction_cache[d_idx], num_rows_segment, &position[d_idx], + num_rows_segment, &position_tmp[d_idx], num_rows_segment, + &device_gpair[d_idx], num_rows_segment, &device_matrix[d_idx].gidx, + num_elements_segment, // constant and same on all devices + &device_matrix[d_idx].ridx, + num_elements_segment, // constant and same on all devices + &gidx_feature_map[d_idx], n_bins, // constant and same on all devices + &gidx_fvalue_map[d_idx], + hmat_.cut.size()); // constant and same on all devices + + // Copy Host to Device (assumes comes after ba.allocate that sets device) + device_matrix[d_idx].Init(device_idx, gmat_, + device_element_segments[d_idx], + device_element_segments[d_idx + 1]); + gidx_feature_map[d_idx] = h_gidx_feature_map; + gidx_fvalue_map[d_idx] = hmat_.cut; + feature_segments[d_idx] = h_feature_segments; + fidx_min_map[d_idx] = hmat_.min_val; + + // Initialize, no copy + hist_vec[d_idx].Init(n_bins); // init host object + prediction_cache[d_idx].fill(0); // init device object (assumes comes + // after ba.allocate that sets device) + feature_flags[d_idx].fill(1); // init device object (assumes comes after + // ba.allocate that sets device) + } + + if (!param.silent) { + const int mb_size = 1048576; + LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << " MB"; + } + + initialised = true; + } + + // copy or init to do every iteration + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + + nodes[d_idx].fill(Node()); + nodes_temp[d_idx].fill(Node()); + nodes_child_temp[d_idx].fill(Node()); + + position[d_idx].fill(0); + + device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx], + gpair.begin() + device_row_segments[d_idx + 1]); + + subsample_gpair(&device_gpair[d_idx], param.subsample, + device_row_segments[d_idx]); + + hist_vec[d_idx].Reset(device_idx); + + // left_child_smallest and left_child_smallest_temp don't need to be + // initialized + } + + dh::synchronize_n_devices(n_devices, dList); + + p_last_fmat_ = &fmat; +} void GPUHistBuilder::BuildHist(int depth) { - auto d_ridx = device_matrix.ridx.data(); - auto d_gidx = device_matrix.gidx.data(); - auto d_position = position.data(); - auto d_gpair = device_gpair.data(); - auto hist_builder = hist.GetBuilder(); - auto d_left_child_smallest = left_child_smallest.data(); + // dh::Timer time; - dh::launch_n(device_matrix.gidx.size(), [=] __device__(int idx) { - int ridx = d_ridx[idx]; - int pos = d_position[ridx]; - if (!is_active(pos, depth)) return; + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + size_t begin = device_element_segments[d_idx]; + size_t end = device_element_segments[d_idx + 1]; + size_t row_begin = device_row_segments[d_idx]; - // Only increment smallest node - bool is_smallest = - (d_left_child_smallest[parent_nidx(pos)] && is_left_child(pos)) || - (!d_left_child_smallest[parent_nidx(pos)] && !is_left_child(pos)); - if (!is_smallest && depth > 0) return; + auto d_ridx = device_matrix[d_idx].ridx.data(); + auto d_gidx = device_matrix[d_idx].gidx.data(); + auto d_position = position[d_idx].data(); + auto d_gpair = device_gpair[d_idx].data(); + auto d_left_child_smallest = left_child_smallest[d_idx].data(); + auto hist_builder = hist_vec[d_idx].GetBuilder(); - int gidx = d_gidx[idx]; - gpu_gpair gpair = d_gpair[ridx]; + dh::launch_n(device_idx, end - begin, [=] 
__device__(int local_idx) { + int ridx = d_ridx[local_idx]; // OPTMARK: latency + int nidx = d_position[ridx - row_begin]; // OPTMARK: latency + if (!is_active(nidx, depth)) return; - hist_builder.Add(gpair, gidx, pos); - }); + // Only increment smallest node + bool is_smallest = + (d_left_child_smallest[parent_nidx(nidx)] && is_left_child(nidx)) || + (!d_left_child_smallest[parent_nidx(nidx)] && !is_left_child(nidx)); + if (!is_smallest && depth > 0) return; - dh::safe_cuda(cudaDeviceSynchronize()); + int gidx = d_gidx[local_idx]; + gpu_gpair gpair = d_gpair[ridx - row_begin]; - // Subtraction trick - int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins; - if (depth > 0) { - dh::launch_n(n_sub_bins, [=] __device__(int idx) { - int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2); - bool left_smallest = d_left_child_smallest[parent_nidx(nidx)]; - if (left_smallest) { - nidx++; // If left is smallest switch to right child - } - - int gidx = idx % hist_builder.n_bins; - gpu_gpair parent = hist_builder.Get(gidx, parent_nidx(nidx)); - int other_nidx = left_smallest ? nidx - 1 : nidx + 1; - gpu_gpair other = hist_builder.Get(gidx, other_nidx); - hist_builder.Add(parent - other, gidx, nidx); + hist_builder.Add(gpair, gidx, nidx); // OPTMARK: This is slow, could use + // shared memory or cache results + // intead of writing to global + // memory every time in atomic way. }); } - dh::safe_cuda(cudaDeviceSynchronize()); + + // dh::safe_cuda(cudaDeviceSynchronize()); + dh::synchronize_n_devices(n_devices, dList); + +// time.printElapsed("Add Time"); + +#if (NCCL) + // (in-place) reduce each element of histogram (for only current level) across + // multiple gpus + // TODO(JCM): use out of place with pre-allocated buffer, but then have to copy + // back on device + // fprintf(stderr,"sizeof(gpu_gpair)/sizeof(float)=%d\n",sizeof(gpu_gpair)/sizeof(float)); + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + dh::safe_nccl(ncclAllReduce( + reinterpret_cast(hist_vec[d_idx].GetLevelPtr(depth)), + reinterpret_cast(hist_vec[d_idx].GetLevelPtr(depth)), + hist_vec[d_idx].LevelSize(depth) * sizeof(gpu_gpair) / sizeof(float), + ncclFloat, ncclSum, comms[d_idx], *(streams[d_idx]))); + } + + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx]))); + } +#else +// if no NCCL, then presume only 1 GPU, then already correct +#endif + + // time.printElapsed("Reduce-Add Time"); + + // Subtraction trick (applied to all devices in same way -- to avoid doing on + // master and then Bcast) + if (depth > 0) { + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + + auto hist_builder = hist_vec[d_idx].GetBuilder(); + auto d_left_child_smallest = left_child_smallest[d_idx].data(); + int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins; + + dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) { + int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2); + bool left_smallest = d_left_child_smallest[parent_nidx(nidx)]; + if (left_smallest) { + nidx++; // If left is smallest switch to right child + } + + int gidx = idx % hist_builder.n_bins; + gpu_gpair parent = hist_builder.Get(gidx, parent_nidx(nidx)); + int other_nidx = left_smallest ? 
nidx - 1 : nidx + 1; + gpu_gpair other = hist_builder.Get(gidx, other_nidx); + hist_builder.Add(parent - other, gidx, + nidx); // OPTMARK: This is slow, could use shared + // memory or cache results intead of writing to + // global memory every time in atomic way. + }); + } + dh::synchronize_n_devices(n_devices, dList); + } } template __global__ void find_split_kernel( const gpu_gpair* d_level_hist, int* d_feature_segments, int depth, - int n_features, int n_bins, Node* d_nodes, float* d_fidx_min_map, + int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp, + Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map, GPUTrainingParam gpu_param, - bool* d_left_child_smallest, bool colsample, int* d_feature_flags) { - typedef cub::KeyValuePair ArgMaxT; - typedef cub::BlockScan - BlockScanT; - typedef cub::BlockReduce MaxReduceT; - typedef cub::BlockReduce SumReduceT; - - union TempStorage { - typename BlockScanT::TempStorage scan; - typename MaxReduceT::TempStorage max_reduce; - typename SumReduceT::TempStorage sum_reduce; - }; - - struct UninitializedSplit : cub::Uninitialized {}; - struct UninitializedGpair : cub::Uninitialized {}; - - __shared__ UninitializedSplit uninitialized_split; - Split& split = uninitialized_split.Alias(); - __shared__ ArgMaxT block_max; - __shared__ TempStorage temp_storage; - - if (threadIdx.x == 0) { - split = Split(); - } - - __syncthreads(); - - int node_idx = n_nodes(depth - 1) + blockIdx.x; - - for (int fidx = 0; fidx < n_features; fidx++) { - if (colsample && d_feature_flags[fidx] == 0) continue; - - int begin = d_feature_segments[blockIdx.x * n_features + fidx]; - int end = d_feature_segments[blockIdx.x * n_features + fidx + 1]; - int gidx = (begin - (blockIdx.x * n_bins)) + threadIdx.x; - bool thread_active = threadIdx.x < end - begin; - - // Scan histogram - gpu_gpair bin = - thread_active ? d_level_hist[begin + threadIdx.x] : gpu_gpair(); - - gpu_gpair feature_sum; - BlockScanT(temp_storage.scan) - .ExclusiveScan(bin, bin, gpu_gpair(), cub::Sum(), feature_sum); - - // Calculate gain - gpu_gpair parent_sum = d_nodes[node_idx].sum_gradients; - float parent_gain = d_nodes[node_idx].root_gain; - - gpu_gpair missing = parent_sum - feature_sum; - - bool missing_left; - float gain = thread_active - ? loss_chg_missing(bin, missing, parent_sum, parent_gain, - gpu_param, missing_left) - : -FLT_MAX; - __syncthreads(); - - // Find thread with best gain - ArgMaxT tuple(threadIdx.x, gain); - ArgMaxT best = MaxReduceT(temp_storage.max_reduce) - .Reduce(tuple, cub::ArgMax(), end - begin); - - if (threadIdx.x == 0) { - block_max = best; - } - - __syncthreads(); - - // Best thread updates split - if (threadIdx.x == block_max.key) { - float fvalue; - if (threadIdx.x == 0) { - fvalue = d_fidx_min_map[fidx]; - } else { - fvalue = d_gidx_fvalue_map[gidx - 1]; - } - - gpu_gpair left = missing_left ? 
bin + missing : bin; - gpu_gpair right = parent_sum - left; - - split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param); - } - __syncthreads(); - } - - // Create node - if (threadIdx.x == 0) { - d_nodes[node_idx].split = split; - if (depth == 0) { - // split.Print(); - } - - d_nodes[left_child_nidx(node_idx)] = Node( - split.left_sum, - CalcGain(gpu_param, split.left_sum.grad(), split.left_sum.hess()), - CalcWeight(gpu_param, split.left_sum.grad(), split.left_sum.hess())); - - d_nodes[right_child_nidx(node_idx)] = Node( - split.right_sum, - CalcGain(gpu_param, split.right_sum.grad(), split.right_sum.hess()), - CalcWeight(gpu_param, split.right_sum.grad(), split.right_sum.hess())); - - // Record smallest node - if (split.left_sum.hess() <= split.right_sum.hess()) { - d_left_child_smallest[node_idx] = true; - } else { - d_left_child_smallest[node_idx] = false; - } - } -} -template -__global__ void find_split_general_kernel( - const gpu_gpair* d_level_hist, int* d_feature_segments, int depth, - int n_features, int n_bins, Node* d_nodes, float* d_fidx_min_map, - float* d_gidx_fvalue_map, GPUTrainingParam gpu_param, - bool* d_left_child_smallest, bool colsample, int* d_feature_flags) { + bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) { typedef cub::KeyValuePair ArgMaxT; typedef cub::BlockScan BlockScanT; @@ -279,14 +468,17 @@ __global__ void find_split_general_kernel( __syncthreads(); - int node_idx = n_nodes(depth - 1) + blockIdx.x; + // below two are for accessing full-sized node list stored on each device + // always one block per node, BLOCK_THREADS threads per block + int level_node_idx = blockIdx.x + nodes_offset_device; + int node_idx = n_nodes(depth - 1) + level_node_idx; for (int fidx = 0; fidx < n_features; fidx++) { if (colsample && d_feature_flags[fidx] == 0) continue; - int begin = d_feature_segments[blockIdx.x * n_features + fidx]; - int end = d_feature_segments[blockIdx.x * n_features + fidx + 1]; - int gidx = (begin - (blockIdx.x * n_bins)) + threadIdx.x; + int begin = d_feature_segments[level_node_idx * n_features + fidx]; + int end = d_feature_segments[level_node_idx * n_features + fidx + 1]; + int gidx = (begin - (level_node_idx * n_bins)) + threadIdx.x; bool thread_active = threadIdx.x < end - begin; gpu_gpair feature_sum = gpu_gpair(); @@ -359,26 +551,46 @@ __global__ void find_split_general_kernel( // Create node if (threadIdx.x == 0) { - d_nodes[node_idx].split = split; - if (depth == 0) { - // split.Print(); + if (d_nodes_temp == NULL) { + d_nodes[node_idx].split = split; + } else { + d_nodes_temp[blockIdx.x] = d_nodes[node_idx]; // first copy node values + d_nodes_temp[blockIdx.x].split = split; // now assign split } - d_nodes[left_child_nidx(node_idx)] = Node( + // if (depth == 0) { + // split.Print(); + // } + + Node *Nodeleft, *Noderight; + bool* left_child_smallest; + if (d_nodes_temp == NULL) { + Nodeleft = &d_nodes[left_child_nidx(node_idx)]; + Noderight = &d_nodes[right_child_nidx(node_idx)]; + left_child_smallest = + &d_left_child_smallest_temp[node_idx]; // NOTE: not per level, even + // though _temp variable name + } else { + Nodeleft = &d_nodes_child_temp[blockIdx.x * 2 + 0]; + Noderight = &d_nodes_child_temp[blockIdx.x * 2 + 1]; + left_child_smallest = &d_left_child_smallest_temp[blockIdx.x]; + } + + *Nodeleft = Node( split.left_sum, CalcGain(gpu_param, split.left_sum.grad(), split.left_sum.hess()), CalcWeight(gpu_param, split.left_sum.grad(), split.left_sum.hess())); - d_nodes[right_child_nidx(node_idx)] = Node( + 
*Noderight = Node( split.right_sum, CalcGain(gpu_param, split.right_sum.grad(), split.right_sum.hess()), CalcWeight(gpu_param, split.right_sum.grad(), split.right_sum.hess())); // Record smallest node if (split.left_sum.hess() <= split.right_sum.hess()) { - d_left_child_smallest[node_idx] = true; + *left_child_smallest = true; } else { - d_left_child_smallest[node_idx] = false; + *left_child_smallest = false; } } } @@ -393,60 +605,221 @@ void GPUHistBuilder::FindSplit(int depth) { template <> void GPUHistBuilder::FindSplitSpecialize(int depth) { - const int GRID_SIZE = n_nodes_level(depth); - bool colsample = - param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0; - - find_split_general_kernel< - MAX_BLOCK_THREADS><<>>( - hist.GetLevelPtr(depth), feature_segments.data(), depth, info->num_col, - hmat_.row_ptr.back(), nodes.data(), fidx_min_map.data(), - gidx_fvalue_map.data(), gpu_param, left_child_smallest.data(), colsample, - feature_flags.data()); - - dh::safe_cuda(cudaDeviceSynchronize()); + LaunchFindSplit(depth); } template void GPUHistBuilder::FindSplitSpecialize(int depth) { if (param.max_bin <= BLOCK_THREADS) { - const int GRID_SIZE = n_nodes_level(depth); - bool colsample = - param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0; - - find_split_general_kernel<<>>( - hist.GetLevelPtr(depth), feature_segments.data(), depth, info->num_col, - hmat_.row_ptr.back(), nodes.data(), fidx_min_map.data(), - gidx_fvalue_map.data(), gpu_param, left_child_smallest.data(), - colsample, feature_flags.data()); + LaunchFindSplit(depth); } else { this->FindSplitSpecialize(depth); } - - dh::safe_cuda(cudaDeviceSynchronize()); } -void GPUHistBuilder::InitFirstNode() { - auto d_gpair = device_gpair.data(); - auto d_node_sums = node_sums.data(); - auto d_nodes = nodes.data(); - auto gpu_param_alias = gpu_param; +template +void GPUHistBuilder::LaunchFindSplit(int depth) { + bool colsample = + param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0; - size_t temp_storage_bytes; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, d_gpair, d_node_sums, - device_gpair.size(), cub::Sum(), gpu_gpair()); - cub_mem.LazyAllocate(temp_storage_bytes); - cub::DeviceReduce::Reduce(cub_mem.d_temp_storage, cub_mem.temp_storage_bytes, - d_gpair, d_node_sums, device_gpair.size(), - cub::Sum(), gpu_gpair()); + // use power of 2 for split finder because nodes are power of 2 (broadcast + // result to remaining devices) + int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices))); + find_split_n_devices = std::min(n_nodes_level(depth), find_split_n_devices); + int num_nodes_device = n_nodes_level(depth) / find_split_n_devices; + int num_nodes_child_device = n_nodes_level(depth + 1) / find_split_n_devices; + const int GRID_SIZE = num_nodes_device; - dh::launch_n(1, [=] __device__(int idx) { - gpu_gpair sum_gradients = d_node_sums[idx]; - d_nodes[idx] = Node( - sum_gradients, - CalcGain(gpu_param_alias, sum_gradients.grad(), sum_gradients.hess()), - CalcWeight(gpu_param_alias, sum_gradients.grad(), - sum_gradients.hess())); - }); +#if (NCCL) + // NOTE: No need to scatter before gather as all devices have same copy of + // nodes, and within find_split_kernel() nodes_temp is given values from nodes + + // for all nodes (split among devices) find best split per node + for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + + int nodes_offset_device = d_idx * num_nodes_device; + find_split_kernel<<>>( + 
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), + feature_segments[d_idx].data(), depth, (info->num_col), + (hmat_.row_ptr.back()), nodes[d_idx].data(), + nodes_temp[d_idx].data(), nodes_child_temp[d_idx].data(), + nodes_offset_device, fidx_min_map[d_idx].data(), + gidx_fvalue_map[d_idx].data(), gpu_param, + left_child_smallest_temp[d_idx].data(), colsample, + feature_flags[d_idx].data()); + } + + // nccl only on devices that did split + dh::synchronize_n_devices(find_split_n_devices, dList); + + for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + + dh::safe_nccl(ncclAllGather( + reinterpret_cast(nodes_temp[d_idx].data()), + num_nodes_device * sizeof(Node) / sizeof(char), ncclChar, + reinterpret_cast(nodes[d_idx].data() + n_nodes(depth - 1)), + find_split_comms[find_split_n_devices - 1][d_idx], *(streams[d_idx]))); + + if (depth != + param.max_depth) { // don't copy over children nodes if no more nodes + dh::safe_nccl( + ncclAllGather(reinterpret_cast(nodes_child_temp[d_idx].data()), + num_nodes_child_device * sizeof(Node) / sizeof(char), + ncclChar, reinterpret_cast(nodes[d_idx].data() + n_nodes(depth)), + find_split_comms[find_split_n_devices - 1][d_idx], + *(streams[d_idx]))); // Note offset by n_nodes(depth) + // for recvbuff for child nodes + } + + dh::safe_nccl(ncclAllGather( + reinterpret_cast(left_child_smallest_temp[d_idx].data()), + num_nodes_device * sizeof(bool) / sizeof(char), ncclChar, + reinterpret_cast(left_child_smallest[d_idx].data() + n_nodes(depth - 1)), + find_split_comms[find_split_n_devices - 1][d_idx], *(streams[d_idx]))); + } + + for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx]))); + } + + if (n_devices > find_split_n_devices && n_devices > 1) { + // if n_devices==1, no need to Bcast + // if find_split_n_devices==1, this is just a copy operation, else it copies + // from master to all nodes in case extra devices not involved in split + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + + int master_device = dList[0]; + dh::safe_nccl( + ncclBcast(reinterpret_cast(nodes[d_idx].data() + n_nodes(depth - 1)), + n_nodes_level(depth) * sizeof(Node) / sizeof(char), + ncclChar, master_device, comms[d_idx], *(streams[d_idx]))); + + if (depth != + param.max_depth) { // don't copy over children nodes if no more nodes + dh::safe_nccl(ncclBcast( + reinterpret_cast(nodes[d_idx].data() + n_nodes(depth)), + n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar, + master_device, comms[d_idx], *(streams[d_idx]))); + } + + dh::safe_nccl(ncclBcast( + reinterpret_cast(left_child_smallest[d_idx].data() + n_nodes(depth - 1)), + n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar, + master_device, comms[d_idx], *(streams[d_idx]))); + } + + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx]))); + } + } + +#else + { + int d_idx = 0; + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); + + int nodes_offset_device = d_idx * num_nodes_device; + find_split_kernel<<>>( + (const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), + feature_segments[d_idx].data(), depth, (info->num_col), + (hmat_.row_ptr.back()), 
nodes[d_idx].data(), NULL, NULL, + nodes_offset_device, fidx_min_map[d_idx].data(), + gidx_fvalue_map[d_idx].data(), gpu_param, + left_child_smallest[d_idx].data(), colsample, + feature_flags[d_idx].data()); + } +#endif + + // NOTE: No need to synchronize with host as all above pure P2P ops or + // on-device ops +} + +void GPUHistBuilder::InitFirstNode(const std::vector& gpair) { +#ifdef _WIN32 + // Visual studio complains about C:/Program Files (x86)/Microsoft Visual + // Studio 14.0/VC/bin/../../VC/INCLUDE\utility(445): error : static assertion + // failed with "tuple index out of bounds" + // and C:/Program Files (x86)/Microsoft Visual Studio + // 14.0/VC/bin/../../VC/INCLUDE\future(1888): error : no instance of function + // template "std::_Invoke_stored" matches the argument list + std::vector<gpu_gpair> future_results(n_devices); + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + + auto begin = device_gpair[d_idx].tbegin(); + auto end = device_gpair[d_idx].tend(); + gpu_gpair init = gpu_gpair(); + auto binary_op = thrust::plus<gpu_gpair>(); + + dh::safe_cuda(cudaSetDevice(device_idx)); + future_results[d_idx] = thrust::reduce(begin, end, init, binary_op); + } + + // sum over devices on host + gpu_gpair sum = gpu_gpair(); + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + sum += future_results[d_idx]; + } +#else + // asynchronous reduce per device + + std::vector<std::future<gpu_gpair>> future_results(n_devices); + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + + auto begin = device_gpair[d_idx].tbegin(); + auto end = device_gpair[d_idx].tend(); + gpu_gpair init = gpu_gpair(); + auto binary_op = thrust::plus<gpu_gpair>(); + + // std::async captures the algorithm parameters by value + // use std::launch::async to ensure the creation of a new thread + future_results[d_idx] = std::async(std::launch::async, [=] { + dh::safe_cuda(cudaSetDevice(device_idx)); + return thrust::reduce(begin, end, init, binary_op); + }); + } + + // sum over devices on host (with blocking get()) + gpu_gpair sum = gpu_gpair(); + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + sum += future_results[d_idx].get(); + } +#endif + + // Setup first node so all devices have same first node (here done same on all + // devices, or could have done one device and Bcast if worried about exact + // precision issues) + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + + auto d_nodes = nodes[d_idx].data(); + auto gpu_param_alias = gpu_param; + + dh::launch_n(device_idx, 1, [=] __device__(int idx) { + gpu_gpair sum_gradients = sum; + d_nodes[idx] = Node( + sum_gradients, + CalcGain(gpu_param_alias, sum_gradients.grad(), sum_gradients.hess()), + CalcWeight(gpu_param_alias, sum_gradients.grad(), + sum_gradients.hess())); + }); + } + // synch all devices to host before moving on (No, can avoid because BuildHist + // calls another kernel in default stream) + // dh::synchronize_n_devices(n_devices, dList); } void GPUHistBuilder::UpdatePosition(int depth) { @@ -458,101 +831,117 @@ void GPUHistBuilder::UpdatePosition(int depth) { } void GPUHistBuilder::UpdatePositionDense(int depth) { - auto d_position = position.data(); - Node* d_nodes = nodes.data(); - auto d_gidx_fvalue_map = gidx_fvalue_map.data(); - auto d_gidx = device_matrix.gidx.data(); - int n_columns = info->num_col; + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; - int gidx_size = 
device_matrix.gidx.size(); + auto d_position = position[d_idx].data(); + Node* d_nodes = nodes[d_idx].data(); + auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data(); + auto d_gidx = device_matrix[d_idx].gidx.data(); + int n_columns = info->num_col; + size_t begin = device_row_segments[d_idx]; + size_t end = device_row_segments[d_idx + 1]; - dh::launch_n(position.size(), [=] __device__(int idx) { - NodeIdT pos = d_position[idx]; - if (!is_active(pos, depth)) { - return; - } - Node node = d_nodes[pos]; + dh::launch_n(device_idx, end - begin, [=] __device__(int local_idx) { + NodeIdT pos = d_position[local_idx]; + if (!is_active(pos, depth)) { + return; + } + Node node = d_nodes[pos]; - if (node.IsLeaf()) { - return; - } + if (node.IsLeaf()) { + return; + } - int gidx = d_gidx[idx * n_columns + node.split.findex]; + int gidx = d_gidx[local_idx * n_columns + node.split.findex]; - float fvalue = d_gidx_fvalue_map[gidx]; - - if (fvalue <= node.split.fvalue) { - d_position[idx] = left_child_nidx(pos); - } else { - d_position[idx] = right_child_nidx(pos); - } - }); - - dh::safe_cuda(cudaDeviceSynchronize()); -} - -void GPUHistBuilder::UpdatePositionSparse(int depth) { - auto d_position = position.data(); - auto d_position_tmp = position_tmp.data(); - Node* d_nodes = nodes.data(); - auto d_gidx_feature_map = gidx_feature_map.data(); - auto d_gidx_fvalue_map = gidx_fvalue_map.data(); - auto d_gidx = device_matrix.gidx.data(); - auto d_ridx = device_matrix.ridx.data(); - - // Update missing direction - dh::launch_n(position.size(), [=] __device__(int idx) { - NodeIdT pos = d_position[idx]; - if (!is_active(pos, depth)) { - d_position_tmp[idx] = pos; - return; - } - - Node node = d_nodes[pos]; - - if (node.IsLeaf()) { - d_position_tmp[idx] = pos; - return; - } else if (node.split.missing_left) { - d_position_tmp[idx] = pos * 2 + 1; - } else { - d_position_tmp[idx] = pos * 2 + 2; - } - }); - - dh::safe_cuda(cudaDeviceSynchronize()); - - // Update node based on fvalue where exists - dh::launch_n(device_matrix.gidx.size(), [=] __device__(int idx) { - int ridx = d_ridx[idx]; - NodeIdT pos = d_position[ridx]; - if (!is_active(pos, depth)) { - return; - } - - Node node = d_nodes[pos]; - - if (node.IsLeaf()) { - return; - } - - int gidx = d_gidx[idx]; - int findex = d_gidx_feature_map[gidx]; - - if (findex == node.split.findex) { float fvalue = d_gidx_fvalue_map[gidx]; if (fvalue <= node.split.fvalue) { - d_position_tmp[ridx] = left_child_nidx(pos); + d_position[local_idx] = left_child_nidx(pos); } else { - d_position_tmp[ridx] = right_child_nidx(pos); + d_position[local_idx] = right_child_nidx(pos); } - } - }); + }); + } + dh::synchronize_n_devices(n_devices, dList); + // dh::safe_cuda(cudaDeviceSynchronize()); +} - dh::safe_cuda(cudaDeviceSynchronize()); +void GPUHistBuilder::UpdatePositionSparse(int depth) { + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; - position = position_tmp; + auto d_position = position[d_idx].data(); + auto d_position_tmp = position_tmp[d_idx].data(); + Node* d_nodes = nodes[d_idx].data(); + auto d_gidx_feature_map = gidx_feature_map[d_idx].data(); + auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data(); + auto d_gidx = device_matrix[d_idx].gidx.data(); + auto d_ridx = device_matrix[d_idx].ridx.data(); + + size_t row_begin = device_row_segments[d_idx]; + size_t row_end = device_row_segments[d_idx + 1]; + size_t element_begin = device_element_segments[d_idx]; + size_t element_end = device_element_segments[d_idx + 1]; + + // Update missing 
direction + dh::launch_n(device_idx, row_end - row_begin, + [=] __device__(int local_idx) { + NodeIdT pos = d_position[local_idx]; + if (!is_active(pos, depth)) { + d_position_tmp[local_idx] = pos; + return; + } + + Node node = d_nodes[pos]; + + if (node.IsLeaf()) { + d_position_tmp[local_idx] = pos; + return; + } else if (node.split.missing_left) { + d_position_tmp[local_idx] = pos * 2 + 1; + } else { + d_position_tmp[local_idx] = pos * 2 + 2; + } + }); + + // Update node based on fvalue where exists + // OPTMARK: This kernel is very inefficient for both compute and memory, + // dominated by memory dependency / access patterns + dh::launch_n( + device_idx, element_end - element_begin, [=] __device__(int local_idx) { + int ridx = d_ridx[local_idx]; + NodeIdT pos = d_position[ridx - row_begin]; + if (!is_active(pos, depth)) { + return; + } + + Node node = d_nodes[pos]; + + if (node.IsLeaf()) { + return; + } + + int gidx = d_gidx[local_idx]; + int findex = d_gidx_feature_map[gidx]; // OPTMARK: slowest global + // memory access, maybe setup + // position, gidx, etc. as + // combined structure? + + if (findex == node.split.findex) { + float fvalue = d_gidx_fvalue_map[gidx]; + + if (fvalue <= node.split.fvalue) { + d_position_tmp[ridx - row_begin] = left_child_nidx(pos); + } else { + d_position_tmp[ridx - row_begin] = right_child_nidx(pos); + } + } + }); + position[d_idx] = position_tmp[d_idx]; + } + dh::synchronize_n_devices(n_devices, dList); } void GPUHistBuilder::ColSampleTree() { @@ -572,92 +961,18 @@ void GPUHistBuilder::ColSampleLevel() { for (auto fidx : feature_set_level) { h_feature_flags[fidx] = 1; } - feature_flags = h_feature_flags; -} -void GPUHistBuilder::InitData(const std::vector& gpair, - DMatrix& fmat, // NOLINT - const RegTree& tree) { - if (!initialised) { - CHECK(fmat.SingleColBlock()) << "grow_gpu_hist: must have single column " - "block. 
Try setting 'tree_method' " - "parameter to 'exact'"; - info = &fmat.info(); - is_dense = info->num_nonzero == info->num_col * info->num_row; - hmat_.Init(&fmat, param.max_bin); - gmat_.cut = &hmat_; - gmat_.Init(&fmat); - int n_bins = hmat_.row_ptr.back(); - int n_features = hmat_.row_ptr.size() - 1; + // copy from Host to Device for all devices + // for(auto &f:feature_flags){ // this doesn't set device as should + // f = h_feature_flags; + // } + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + dh::safe_cuda(cudaSetDevice(device_idx)); - // Build feature segments - std::vector h_feature_segments; - for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) { - for (int fidx = 0; fidx < hmat_.row_ptr.size() - 1; fidx++) { - h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins); - } - } - h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins); - - int level_max_bins = n_nodes_level(param.max_depth - 1) * n_bins; - - size_t free_memory = dh::available_memory(); - ba.allocate(&gidx_feature_map, n_bins, &hist_node_segments, - n_nodes_level(param.max_depth - 1) + 1, &feature_segments, - h_feature_segments.size(), &gain, level_max_bins, &position, - gpair.size(), &position_tmp, gpair.size(), &nodes, - n_nodes(param.max_depth), &gidx_fvalue_map, hmat_.cut.size(), - &fidx_min_map, hmat_.min_val.size(), &argmax, - n_nodes_level(param.max_depth - 1), &node_sums, - n_nodes_level(param.max_depth - 1) * n_features, &hist_scan, - level_max_bins, &device_gpair, gpair.size(), - &device_matrix.gidx, gmat_.index.size(), &device_matrix.ridx, - gmat_.index.size(), &hist.hist, - n_nodes(param.max_depth - 1) * n_bins, &feature_flags, - n_features, &left_child_smallest, n_nodes(param.max_depth - 1), - &prediction_cache, gpair.size()); - - if (!param.silent) { - const int mb_size = 1048576; - LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/" - << free_memory / mb_size << " MB on " << dh::device_name(); - } - - // Construct feature map - std::vector h_gidx_feature_map(n_bins); - for (int row = 0; row < hmat_.row_ptr.size() - 1; row++) { - for (int i = hmat_.row_ptr[row]; i < hmat_.row_ptr[row + 1]; i++) { - h_gidx_feature_map[i] = row; - } - } - - gidx_feature_map = h_gidx_feature_map; - - // Construct device matrix - device_matrix.Init(gmat_); - - gidx_fvalue_map = hmat_.cut; - fidx_min_map = hmat_.min_val; - - thrust::sequence(hist_node_segments.tbegin(), hist_node_segments.tend(), 0, - n_bins); - - feature_flags.fill(1); - - feature_segments = h_feature_segments; - - hist.Init(n_bins); - - prediction_cache.fill(0); - - initialised = true; + feature_flags[d_idx] = h_feature_flags; } - nodes.fill(Node()); - position.fill(0); - device_gpair = gpair; - subsample_gpair(&device_gpair, param.subsample); - hist.Reset(); - p_last_fmat_ = &fmat; + dh::synchronize_n_devices(n_devices, dList); } bool GPUHistBuilder::UpdatePredictionCache( @@ -667,25 +982,40 @@ bool GPUHistBuilder::UpdatePredictionCache( if (nodes.empty() || !p_last_fmat_ || data != p_last_fmat_) { return false; } - CHECK_EQ(prediction_cache.size(), out_preds.size()); if (!prediction_cache_initialised) { - prediction_cache = out_preds; + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + size_t row_begin = device_row_segments[d_idx]; + size_t row_end = device_row_segments[d_idx + 1]; + + prediction_cache[d_idx].copy(out_preds.begin() + row_begin, + out_preds.begin() + row_end); + } prediction_cache_initialised = true; } + 
dh::synchronize_n_devices(n_devices, dList); - auto d_nodes = nodes.data(); - auto d_position = position.data(); - auto d_prediction_cache = prediction_cache.data(); float eps = param.learning_rate; + for (int d_idx = 0; d_idx < n_devices; d_idx++) { + int device_idx = dList[d_idx]; + size_t row_begin = device_row_segments[d_idx]; + size_t row_end = device_row_segments[d_idx + 1]; - dh::launch_n(prediction_cache.size(), [=] __device__(int idx) { - int pos = d_position[idx]; - d_prediction_cache[idx] += d_nodes[pos].weight * eps; - }); + auto d_nodes = nodes[d_idx].data(); + auto d_position = position[d_idx].data(); + auto d_prediction_cache = prediction_cache[d_idx].data(); - thrust::copy(prediction_cache.tbegin(), prediction_cache.tend(), - out_preds.data()); + dh::launch_n(device_idx, prediction_cache[d_idx].size(), + [=] __device__(int local_idx) { + int pos = d_position[local_idx]; + d_prediction_cache[local_idx] += d_nodes[pos].weight * eps; + }); + + thrust::copy(prediction_cache[d_idx].tbegin(), + prediction_cache[d_idx].tend(), &out_preds[row_begin]); + } + dh::synchronize_n_devices(n_devices, dList); return true; } @@ -693,15 +1023,32 @@ bool GPUHistBuilder::UpdatePredictionCache( void GPUHistBuilder::Update(const std::vector& gpair, DMatrix* p_fmat, RegTree* p_tree) { this->InitData(gpair, *p_fmat, *p_tree); - this->InitFirstNode(); + this->InitFirstNode(gpair); this->ColSampleTree(); + // long long int elapsed=0; for (int depth = 0; depth < param.max_depth; depth++) { this->ColSampleLevel(); + + // dh::Timer time; this->BuildHist(depth); + // elapsed+=time.elapsed(); + // printf("depth=%d\n",depth); + // time.printElapsed("BH Time"); + + // dh::Timer timesplit; this->FindSplit(depth); + // timesplit.printElapsed("FS Time"); + + // dh::Timer timeupdatepos; this->UpdatePosition(depth); + // timeupdatepos.printElapsed("UP Time"); } - dense2sparse_tree(p_tree, nodes.tbegin(), nodes.tend(), param); + // printf("Total BuildHist Time=%lld\n",elapsed); + + // done with multi-GPU, pass back result from master to tree on host + int master_device = dList[0]; + dense2sparse_tree(p_tree, nodes[master_device].tbegin(), + nodes[master_device].tend(), param); } } // namespace tree } // namespace xgboost diff --git a/plugin/updater_gpu/src/gpu_hist_builder.cuh b/plugin/updater_gpu/src/gpu_hist_builder.cuh index fb9a5068c..d553b9ce0 100644 --- a/plugin/updater_gpu/src/gpu_hist_builder.cuh +++ b/plugin/updater_gpu/src/gpu_hist_builder.cuh @@ -1,5 +1,5 @@ /*! 
- * Copyright 2016 Rory mitchell + * Copyright 2017 XGBoost contributors */ #pragma once #include @@ -11,6 +11,14 @@ #include "device_helpers.cuh" #include "types.cuh" +#ifndef NCCL +#define NCCL 1 +#endif + +#if (NCCL) +#include "nccl.h" +#endif + namespace xgboost { namespace tree { @@ -18,7 +26,8 @@ namespace tree { struct DeviceGMat { dh::dvec gidx; dh::dvec ridx; - void Init(const common::GHistIndexMatrix &gmat); + void Init(int device_idx, const common::GHistIndexMatrix &gmat, + bst_uint begin, bst_uint end); }; struct HistBuilder { @@ -31,11 +40,11 @@ struct HistBuilder { struct DeviceHist { int n_bins; - dh::dvec hist; + dh::dvec data; void Init(int max_depth); - void Reset(); + void Reset(int device_idx); HistBuilder GetBuilder(); @@ -64,7 +73,9 @@ class GPUHistBuilder { void FindSplit(int depth); template void FindSplitSpecialize(int depth); - void InitFirstNode(); + template + void LaunchFindSplit(int depth); + void InitFirstNode(const std::vector &gpair); void UpdatePosition(int depth); void UpdatePositionDense(int depth); void UpdatePositionSparse(int depth); @@ -80,32 +91,48 @@ class GPUHistBuilder { MetaInfo *info; bool initialised; bool is_dense; - DeviceGMat device_matrix; const DMatrix *p_last_fmat_; - - dh::bulk_allocator ba; - dh::CubMemory cub_mem; - dh::dvec gidx_feature_map; - dh::dvec hist_node_segments; - dh::dvec feature_segments; - dh::dvec gain; - dh::dvec position; - dh::dvec position_tmp; - dh::dvec gidx_fvalue_map; - dh::dvec fidx_min_map; - DeviceHist hist; - dh::dvec> argmax; - dh::dvec node_sums; - dh::dvec hist_scan; - dh::dvec device_gpair; - dh::dvec nodes; - dh::dvec feature_flags; - dh::dvec left_child_smallest; - dh::dvec prediction_cache; bool prediction_cache_initialised; + // choose which memory type to use (DEVICE or DEVICE_MANAGED) + dh::bulk_allocator ba; + // dh::bulk_allocator ba; // can't be used + // with NCCL + dh::CubMemory cub_mem; + std::vector feature_set_tree; std::vector feature_set_level; + + bst_uint num_rows; + int n_devices; + + // below vectors are for each devices used + std::vector dList; + std::vector device_row_segments; + std::vector device_element_segments; + + std::vector hist_vec; + std::vector> nodes; + std::vector> nodes_temp; + std::vector> nodes_child_temp; + std::vector> left_child_smallest; + std::vector> left_child_smallest_temp; + std::vector> feature_flags; + std::vector> fidx_min_map; + std::vector> feature_segments; + std::vector> prediction_cache; + std::vector> position; + std::vector> position_tmp; + std::vector device_matrix; + std::vector> device_gpair; + std::vector> gidx_feature_map; + std::vector> gidx_fvalue_map; + + std::vector streams; +#if (NCCL) + std::vector comms; + std::vector> find_split_comms; +#endif }; } // namespace tree } // namespace xgboost diff --git a/plugin/updater_gpu/src/types.cuh b/plugin/updater_gpu/src/types.cuh index 95717dba6..bba2bb5d5 100644 --- a/plugin/updater_gpu/src/types.cuh +++ b/plugin/updater_gpu/src/types.cuh @@ -1,5 +1,5 @@ /*! - * Copyright 2016 Rory mitchell + * Copyright 2017 XGBoost contributors */ #pragma once #include diff --git a/plugin/updater_gpu/src/updater_gpu.cu b/plugin/updater_gpu/src/updater_gpu.cu index c0a02b27d..f08e54887 100644 --- a/plugin/updater_gpu/src/updater_gpu.cu +++ b/plugin/updater_gpu/src/updater_gpu.cu @@ -1,5 +1,5 @@ /*! 
- * Copyright 2016 Rory Mitchell + * Copyright 2017 XGBoost contributors */ #include #include @@ -76,7 +76,7 @@ class GPUHistMaker : public TreeUpdater { } bool UpdatePredictionCache(const DMatrix* data, - std::vector* out_preds) override { + std::vector* out_preds) override { return builder.UpdatePredictionCache(data, out_preds); } diff --git a/plugin/updater_gpu/test/python/test.py b/plugin/updater_gpu/test/python/test.py index c80be6d4f..10a6cf6cf 100644 --- a/plugin/updater_gpu/test/python/test.py +++ b/plugin/updater_gpu/test/python/test.py @@ -1,3 +1,4 @@ +from __future__ import print_function #pylint: skip-file import sys sys.path.append("../../tests/python") @@ -12,6 +13,10 @@ dpath = '../../demo/data/' ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + print(*args, file=sys.stdout, **kwargs) + class TestGPU(unittest.TestCase): def test_grow_gpu(self): @@ -58,7 +63,7 @@ class TestGPU(unittest.TestCase): 'max_depth': 3, 'eval_metric': 'auc'} res = {} - xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')], + xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')], evals_result=res) assert self.non_decreasing(res['train']['auc']) assert self.non_decreasing(res['test']['auc']) @@ -74,13 +79,13 @@ class TestGPU(unittest.TestCase): 'max_depth': 2, 'eval_metric': 'auc'} res = {} - xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res) + xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res) assert self.non_decreasing(res['train']['auc']) assert res['train']['auc'][0] >= 0.85 for j in range(X2.shape[1]): - for i in rng.choice(X2.shape[0], size=10, replace=False): + for i in rng.choice(X2.shape[0], size=num_rounds, replace=False): X2[i, j] = 2 dtrain3 = xgb.DMatrix(X2, label=y2) @@ -92,17 +97,18 @@ class TestGPU(unittest.TestCase): assert res['train']['auc'][0] >= 0.85 for j in range(X2.shape[1]): - for i in np.random.choice(X2.shape[0], size=10, replace=False): + for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False): X2[i, j] = 3 dtrain4 = xgb.DMatrix(X2, label=y2) res = {} - xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res) + xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res) assert self.non_decreasing(res['train']['auc']) assert res['train']['auc'][0] >= 0.85 - + def test_grow_gpu_hist(self): + n_gpus=-1 tm._skip_if_no_sklearn() from sklearn.datasets import load_digits try: @@ -110,122 +116,180 @@ class TestGPU(unittest.TestCase): except: from sklearn.cross_validation import train_test_split - # regression test --- hist must be same as exact on all-categorial data - ag_param = {'max_depth': 2, - 'tree_method': 'exact', - 'nthread': 1, - 'eta': 1, - 'silent': 1, - 'objective': 'binary:logistic', - 'eval_metric': 'auc'} - ag_param2 = {'max_depth': 2, - 'updater': 'grow_gpu_hist', - 'eta': 1, - 'silent': 1, - 'objective': 'binary:logistic', - 'eval_metric': 'auc'} - ag_res = {} - ag_res2 = {} + for max_depth in range(3,10): # TODO: Doesn't work with 2 for some tests + #eprint("max_depth=%d" % (max_depth)) + + for max_bin_i in range(3,11): + max_bin = np.power(2,max_bin_i) + #eprint("max_bin=%d" % (max_bin)) - num_rounds = 10 - xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], - evals_result=ag_res) - xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], - 
evals_result=ag_res2) - assert ag_res['train']['auc'] == ag_res2['train']['auc'] - assert ag_res['test']['auc'] == ag_res2['test']['auc'] + + + # regression test --- hist must be same as exact on all-categorial data + ag_param = {'max_depth': max_depth, + 'tree_method': 'exact', + 'nthread': 1, + 'eta': 1, + 'silent': 1, + 'objective': 'binary:logistic', + 'eval_metric': 'auc'} + ag_param2 = {'max_depth': max_depth, + 'updater': 'grow_gpu_hist', + 'eta': 1, + 'silent': 1, + 'n_gpus': 1, + 'objective': 'binary:logistic', + 'max_bin': max_bin, + 'eval_metric': 'auc'} + ag_param3 = {'max_depth': max_depth, + 'updater': 'grow_gpu_hist', + 'eta': 1, + 'silent': 1, + 'n_gpus': n_gpus, + 'objective': 'binary:logistic', + 'max_bin': max_bin, + 'eval_metric': 'auc'} + ag_res = {} + ag_res2 = {} + ag_res3 = {} - digits = load_digits(2) - X = digits['data'] - y = digits['target'] - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - dtrain = xgb.DMatrix(X_train, y_train) - dtest = xgb.DMatrix(X_test, y_test) + num_rounds = 10 + #eprint("normal updater"); + xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], + evals_result=ag_res) + #eprint("grow_gpu_hist updater 1 gpu"); + xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], + evals_result=ag_res2) + #eprint("grow_gpu_hist updater %d gpus" % (n_gpus)); + xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], + evals_result=ag_res3) + # assert 1==0 + assert ag_res['train']['auc'] == ag_res2['train']['auc'] + assert ag_res['test']['auc'] == ag_res2['test']['auc'] + assert ag_res['test']['auc'] == ag_res3['test']['auc'] - param = {'objective': 'binary:logistic', - 'updater': 'grow_gpu_hist', - 'max_depth': 3, - 'eval_metric': 'auc'} - res = {} - xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')], - evals_result=res) - assert self.non_decreasing(res['train']['auc']) - assert self.non_decreasing(res['test']['auc']) + ###################################################################### + digits = load_digits(2) + X = digits['data'] + y = digits['target'] + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + dtrain = xgb.DMatrix(X_train, y_train) + dtest = xgb.DMatrix(X_test, y_test) - # fail-safe test for dense data - from sklearn.datasets import load_svmlight_file - X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train') - X2 = X2.toarray() - dtrain2 = xgb.DMatrix(X2, label=y2) + param = {'objective': 'binary:logistic', + 'updater': 'grow_gpu_hist', + 'max_depth': max_depth, + 'n_gpus': 1, + 'max_bin': max_bin, + 'eval_metric': 'auc'} + res = {} + #eprint("digits: grow_gpu_hist updater 1 gpu"); + xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')], + evals_result=res) + assert self.non_decreasing(res['train']['auc']) + #assert self.non_decreasing(res['test']['auc']) + param2 = {'objective': 'binary:logistic', + 'updater': 'grow_gpu_hist', + 'max_depth': max_depth, + 'n_gpus': n_gpus, + 'max_bin': max_bin, + 'eval_metric': 'auc'} + res2 = {} + #eprint("digits: grow_gpu_hist updater %d gpus" % (n_gpus)); + xgb.train(param2, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')], + evals_result=res2) + assert self.non_decreasing(res2['train']['auc']) + #assert self.non_decreasing(res2['test']['auc']) + assert res['train']['auc'] == res2['train']['auc'] + #assert res['test']['auc'] == res2['test']['auc'] - param = {'objective': 'binary:logistic', - 'updater': 
'grow_gpu_hist', - 'max_depth': 2, - 'eval_metric': 'auc'} - res = {} - xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res) + ###################################################################### + # fail-safe test for dense data + from sklearn.datasets import load_svmlight_file + X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train') + X2 = X2.toarray() + dtrain2 = xgb.DMatrix(X2, label=y2) - assert self.non_decreasing(res['train']['auc']) - assert res['train']['auc'][0] >= 0.85 + param = {'objective': 'binary:logistic', + 'updater': 'grow_gpu_hist', + 'max_depth': max_depth, + 'n_gpus': n_gpus, + 'max_bin': max_bin, + 'eval_metric': 'auc'} + res = {} + xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res) - for j in range(X2.shape[1]): - for i in rng.choice(X2.shape[0], size=10, replace=False): - X2[i, j] = 2 + assert self.non_decreasing(res['train']['auc']) + if max_bin>32: + assert res['train']['auc'][0] >= 0.85 - dtrain3 = xgb.DMatrix(X2, label=y2) - res = {} + for j in range(X2.shape[1]): + for i in rng.choice(X2.shape[0], size=num_rounds, replace=False): + X2[i, j] = 2 - xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res) + dtrain3 = xgb.DMatrix(X2, label=y2) + res = {} - assert self.non_decreasing(res['train']['auc']) - assert res['train']['auc'][0] >= 0.85 + xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res) - for j in range(X2.shape[1]): - for i in np.random.choice(X2.shape[0], size=10, replace=False): - X2[i, j] = 3 + assert self.non_decreasing(res['train']['auc']) + if max_bin>32: + assert res['train']['auc'][0] >= 0.85 - dtrain4 = xgb.DMatrix(X2, label=y2) - res = {} - xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res) - assert self.non_decreasing(res['train']['auc']) - assert res['train']['auc'][0] >= 0.85 + for j in range(X2.shape[1]): + for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False): + X2[i, j] = 3 + dtrain4 = xgb.DMatrix(X2, label=y2) + res = {} + xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res) + assert self.non_decreasing(res['train']['auc']) + if max_bin>32: + assert res['train']['auc'][0] >= 0.85 + + ###################################################################### + # fail-safe test for max_bin + param = {'objective': 'binary:logistic', + 'updater': 'grow_gpu_hist', + 'max_depth': max_depth, + 'n_gpus': n_gpus, + 'eval_metric': 'auc', + 'max_bin': max_bin} + res = {} + xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res) + assert self.non_decreasing(res['train']['auc']) + if max_bin>32: + assert res['train']['auc'][0] >= 0.85 + ###################################################################### + # subsampling + param = {'objective': 'binary:logistic', + 'updater': 'grow_gpu_hist', + 'max_depth': max_depth, + 'n_gpus': n_gpus, + 'eval_metric': 'auc', + 'colsample_bytree': 0.5, + 'colsample_bylevel': 0.5, + 'subsample': 0.5, + 'max_bin': max_bin} + res = {} + xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res) + assert self.non_decreasing(res['train']['auc']) + if max_bin>32: + assert res['train']['auc'][0] >= 0.85 + ###################################################################### # fail-safe test for max_bin=2 param = {'objective': 'binary:logistic', 'updater': 'grow_gpu_hist', 'max_depth': 2, + 'n_gpus': n_gpus, 'eval_metric': 'auc', 'max_bin': 2} res = {} - xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res) + 
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res) assert self.non_decreasing(res['train']['auc']) - assert res['train']['auc'][0] >= 0.85 - - # subsampling - param = {'objective': 'binary:logistic', - 'updater': 'grow_gpu_hist', - 'max_depth': 3, - 'eval_metric': 'auc', - 'colsample_bytree': 0.5, - 'colsample_bylevel': 0.5, - 'subsample': 0.5 - } - res = {} - xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res) - assert self.non_decreasing(res['train']['auc']) - assert res['train']['auc'][0] >= 0.85 - - # max_bin = 2048 - param = {'objective': 'binary:logistic', - 'updater': 'grow_gpu_hist', - 'max_depth': 3, - 'eval_metric': 'auc', - 'max_bin': 2048 - } - res = {} - xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res) - assert self.non_decreasing(res['train']['auc']) - assert res['train']['auc'][0] >= 0.85 - + if max_bin>32: + assert res['train']['auc'][0] >= 0.85 + + def non_decreasing(self, L): return all((x - y) < 0.001 for x, y in zip(L, L[1:])) diff --git a/src/tree/param.h b/src/tree/param.h index 23048cde3..a09e2b59f 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -81,6 +81,8 @@ struct TrainParam : public dmlc::Parameter { std::vector monotone_constraints; // gpu to use for single gpu algorithms int gpu_id; + // number of GPUs to use + int n_gpus; // declare the parameters DMLC_DECLARE_PARAMETER(TrainParam) { DMLC_DECLARE_FIELD(learning_rate) @@ -192,6 +194,10 @@ struct TrainParam : public dmlc::Parameter { .set_lower_bound(0) .set_default(0) .describe("gpu to use for single gpu algorithms"); + DMLC_DECLARE_FIELD(n_gpus) + .set_lower_bound(-1) + .set_default(-1) + .describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs"); // add alias of parameters DMLC_DECLARE_ALIAS(reg_lambda, lambda); DMLC_DECLARE_ALIAS(reg_alpha, alpha);
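
Usage sketch (illustrative, not part of the patch): the snippet below shows one way the multi-GPU histogram updater added in this diff could be driven from the Python package, assuming XGBoost was built with the GPU plugin and NCCL enabled. The parameter names ('updater': 'grow_gpu_hist', 'n_gpus', 'gpu_id', 'max_bin') come from the changes to src/tree/param.h and plugin/updater_gpu/test/python/test.py above; the dataset, split, depth, and round count are arbitrary choices for the example.

    import xgboost as xgb
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split

    # Two-class digits problem, mirroring the plugin test suite.
    digits = load_digits(n_class=2)
    X, y = digits['data'], digits['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    param = {'objective': 'binary:logistic',
             'updater': 'grow_gpu_hist',  # GPU histogram tree grower from this patch
             'max_depth': 3,
             'max_bin': 256,              # number of histogram bins
             'gpu_id': 0,                 # first device in the round-robin device list
             'n_gpus': -1,                # new parameter: -1 means use all GPUs
             'eval_metric': 'auc'}

    res = {}
    bst = xgb.train(param, dtrain, num_boost_round=10,
                    evals=[(dtrain, 'train'), (dtest, 'test')],
                    evals_result=res)
    print(res['train']['auc'][-1], res['test']['auc'][-1])

With n_gpus greater than 1, rows are sharded across the devices and the per-level histograms are combined with ncclAllReduce, so the result is expected to match a single-GPU run; the regression tests above assert exactly that equality between the 'n_gpus': 1 and 'n_gpus': -1 configurations.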