[GPU-Plugin] Multi-GPU for grow_gpu_hist histogram method using NVIDIA NCCL. (#2395)
parent e24f25e0c6
commit 41efe32aa5

3	.gitmodules (vendored)
@@ -4,6 +4,9 @@
 [submodule "rabit"]
 	path = rabit
 	url = https://github.com/dmlc/rabit
+[submodule "nccl"]
+	path = nccl
+	url = https://github.com/dmlc/nccl
 [submodule "cub"]
 	path = cub
 	url = https://github.com/NVlabs/cub
@@ -94,40 +94,58 @@ if(MSVC)
else()
  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
  # Prevent shared library being called liblibxgboost.so on Linux
  set(CMAKE_SHARED_LIBRARY_PREFIX "")
endif()

set(LINK_LIBRARIES dmlccore rabit)

if(PLUGIN_UPDATER_GPU)
  # nccl
  set(LINK_LIBRARIES ${LINK_LIBRARIES} nccl)
  add_subdirectory(nccl)
  set(NCCL_DIRECTORY ${PROJECT_SOURCE_DIR}/nccl)
  include_directories(${NCCL_DIRECTORY}/src)
  set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})

  # Find cub
  set(CUB_DIRECTORY "cub/" CACHE PATH "CUB 1.5.4 directory")
  set(CUB_DIRECTORY ${PROJECT_SOURCE_DIR}/cub/)
  include_directories(${CUB_DIRECTORY})

  # Find googletest
  set(GTEST_DIRECTORY "${CACHE_PREFIX}" CACHE PATH "Googletest directory")
  include_directories(${GTEST_DIRECTORY}/include)
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_60;-lineinfo;")
  if(NOT MSVC)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC")
  endif()

  # plugin
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lineinfo;--expt-extended-lambda")
  set(CUDA_SOURCES
    plugin/updater_gpu/src/updater_gpu.cu
    plugin/updater_gpu/src/gpu_hist_builder.cu
  )
  cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
  set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})
  include(${PROJECT_SOURCE_DIR}/cmake/Utils.cmake)
  include(${PROJECT_SOURCE_DIR}/cmake/Cuda.cmake)
  # use below for forcing specific arch
  #cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS} -arch=compute_52)
  # use below for auto-detect, but gpu_grow currently doesn't work with 61
  xgboost_cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
  if(MSVC)
  else()
    cuda_add_library(updater_gpu STATIC ${CUDA_SOURCES})
    set(LINK_LIBRARIES ${LINK_LIBRARIES} updater_gpu)
  endif()
else()
  set(CUDA_OBJS "")
  set(updater_gpu "")
endif()

add_library(objxgboost OBJECT ${SOURCES})
set_target_properties(${objxgboost} PROPERTIES POSITION_INDEPENDENT_CODE 1)

add_executable(runxgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost)
target_link_libraries(runxgboost ${LINK_LIBRARIES})
add_library(libxgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
add_executable(xgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})

add_library(xgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
target_link_libraries(xgboost ${LINK_LIBRARIES})
target_link_libraries(libxgboost ${LINK_LIBRARIES})

option(JVM_BINDINGS "Build JVM bindings" OFF)

@@ -136,11 +154,11 @@ if(JVM_BINDINGS)

include_directories(${JNI_INCLUDE_DIRS} jvm-packages/xgboost4j/src/native)

add_library(xgboost4j SHARED
add_library(libxgboost4j SHARED
  $<TARGET_OBJECTS:objxgboost>
  ${CUDA_OBJS}
  jvm-packages/xgboost4j/src/native/xgboost4j.cpp)
target_link_libraries(xgboost4j
target_link_libraries(libxgboost4j
  ${LINK_LIBRARIES}
  ${JNI_LIBRARIES})
endif()
289	cmake/Cuda.cmake (new file)
@@ -0,0 +1,289 @@
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)

################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
#   xgboost_detect_installed_gpus(out_variable)
function(xgboost_detect_installed_gpus out_variable)
  set(CUDA_gpu_detect_output "")
  if(NOT CUDA_gpu_detect_output)
    set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)

    file(WRITE ${__cufile} ""
      "#include <cstdio>\n"
      "int main()\n"
      "{\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device)\n"
      "  {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")
    if(MSVC)
      # find vcvarsall.bat and run it, building the MSVC environment
      get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
      find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
      execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      ERROR_QUIET
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
    else()
      if(CUDA_LIBRARY_PATH)
        set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
      endif()
      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      ERROR_QUIET
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
    if(__nvcc_res EQUAL 0)
      # nvcc outputs text containing line breaks when building with MSVC.
      # The line below prevents CMake from inserting a variable with line
      # breaks in the cache
      string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}")
      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architectures from xgboost_detect_installed_gpus tool" FORCE)
    else()
      message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}")
    endif()
  endif()

  if(NOT CUDA_gpu_detect_output)
    message(WARNING "Automatic GPU detection failed. Building for all known architectures (${xgboost_known_gpu_archs}).")
    set(${out_variable} ${xgboost_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()
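For reference, the probe that `file(WRITE ...)` emits above can also be compiled by hand to see what the detection step produces. This standalone sketch mirrors that snippet verbatim (nvcc pulls in the CUDA runtime headers automatically for `.cu` files):

```cuda
// Standalone copy of the probe emitted by xgboost_detect_installed_gpus;
// run with: nvcc -arch sm_30 --run detect_cuda_archs.cu
#include <cstdio>

int main() {
  int count = 0;
  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
  if (count == 0) return -1;
  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
      std::printf("%d.%d ", prop.major, prop.minor);  // e.g. "6.1 5.2 "
  }
  return 0;
}
```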
################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
#   xgboost_select_nvcc_arch_flags(out_variable)
function(xgboost_select_nvcc_arch_flags out_variable)
  # List of arch names
  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual")
  set(__archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND __archs_names "Auto")
    set(__archs_name_default "Auto")
  endif()

  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down in the CMake GUI)
  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
  set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names})
  mark_as_advanced(CUDA_ARCH_NAME)

  # verify CUDA_ARCH_NAME value
  if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
    string(REPLACE ";" ", " __archs_names "${__archs_names}")
    message(FATAL_ERROR "Only ${__archs_names} architecture names are supported.")
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(CUDA_ARCH_BIN ${xgboost_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
    set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
    unset(CUDA_ARCH_PTX CACHE)
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Fermi")
    set(__cuda_arch_bin "20 21(20)")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(__cuda_arch_bin "30 35")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
    set(__cuda_arch_bin "50")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
    set(__cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(__cuda_arch_bin ${xgboost_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
    xgboost_detect_installed_gpus(__cuda_arch_bin)
  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()

  # remove dots and convert to lists
  string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}")
  string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}")
  string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}")
  string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}")
  xgboost_list_unique(__cuda_arch_bin __cuda_arch_ptx)

  set(__nvcc_flags "")
  set(__nvcc_archs_readable "")

  # Tell NVCC to add binaries for the specified GPUs
  foreach(__arch ${__cuda_arch_bin})
    if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
      # User explicitly specified PTX for the concrete BIN
      list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
      list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1})
    else()
      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
      list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch})
      list(APPEND __nvcc_archs_readable sm_${__arch})
    endif()
  endforeach()

  # Tell NVCC to add PTX intermediate code for the specified architectures
  foreach(__arch ${__cuda_arch_ptx})
    list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch})
    list(APPEND __nvcc_archs_readable compute_${__arch})
  endforeach()

  string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}")
  set(${out_variable} ${__nvcc_flags} PARENT_SCOPE)
  set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
endfunction()
################################################################################################
# Short command for cuda compilation
# Usage:
#   xgboost_cuda_compile(<objlist_variable> <cuda_files>)
macro(xgboost_cuda_compile objlist_variable)
  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var}_backup_in_cuda_compile_ "${${var}}")

    # we remove /EHa as it generates warnings under windows
    string(REPLACE "/EHa" "" ${var} "${${var}}")

  endforeach()
  if(UNIX OR APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
  endif()

  if(APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
  endif()

  set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G -lineinfo")

  if(MSVC)
    # disable noisy warnings:
    # 4819: The file contains a character that cannot be represented in the current code page (number).
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
    foreach(flag_var
        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif(${flag_var} MATCHES "/MD")
    endforeach(flag_var)
  endif()

  # If the build system is a container, make sure the nvcc intermediate files
  # go into the build output area rather than in /tmp, which may run out of space
  if(IS_CONTAINER_BUILD)
    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
  endif()

  cuda_compile(cuda_objcs ${ARGN})

  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var} "${${var}_backup_in_cuda_compile_}")
    unset(${var}_backup_in_cuda_compile_)
  endforeach()

  set(${objlist_variable} ${cuda_objcs})
endmacro()

################################################################################################
### Non macro section
################################################################################################

# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
if(NOT CUDA_TOOLKIT_ROOT_DIR)
  find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
  if(CUDA_LIBRARY_PATH)
    get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
  endif()
endif()

find_package(CUDA 5.5 QUIET REQUIRED)
find_cuda_helper_libs(curand)  # CMake 2.8.7 compatibility; it doesn't search for curand

if(NOT CUDA_FOUND)
  return()
endif()

set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND xgboost_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
                                ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

# Known NVIDIA GPU architectures xgboost can be compiled for.
# This list will be used for the CUDA_ARCH_NAME = All option
if(CUDA_ARCH_ALL)
  set(xgboost_known_gpu_archs "${CUDA_ARCH_ALL}")
else()
  if(${CUDA_VERSION} GREATER 7.5)
    set(xgboost_known_gpu_archs "30 35 50 52 60 61")
  else()
    set(xgboost_known_gpu_archs "30 35 50 52")
  endif()
endif()

# cudnn detection
if(USE_CUDNN)
  detect_cuDNN()
  if(HAVE_CUDNN)
    add_definitions(-DUSE_CUDNN)
    include_directories(SYSTEM ${CUDNN_INCLUDE})
    list(APPEND xgboost_LINKER_LIBS ${CUDNN_LIBRARY})
  endif()
endif()

# setting nvcc arch flags
xgboost_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")

# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
if(Boost_VERSION EQUAL 105500)
  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
  # avoid warning for CMake >= 2.8.12
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
endif()

# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
endforeach()

# setting default testing device
if(NOT CUDA_TEST_DEVICE)
  set(CUDA_TEST_DEVICE -1)
endif()

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

# Handle clang/libc++ issue
if(APPLE)
  xgboost_detect_darwin_version(OSX_VERSION)

  # OSX 10.9 and higher uses clang/libc++ by default, which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    xgboost_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()
398	cmake/Utils.cmake (new file)
@@ -0,0 +1,398 @@
################################################################################################
# Command alias for debugging messages
# Usage:
#   dmsg(<message>)
function(dmsg)
  message(STATUS ${ARGN})
endfunction()

################################################################################################
# Removes duplicates from list(s)
# Usage:
#   xgboost_list_unique(<list_variable> [<list_variable>] [...])
macro(xgboost_list_unique)
  foreach(__lst ${ARGN})
    if(${__lst})
      list(REMOVE_DUPLICATES ${__lst})
    endif()
  endforeach()
endmacro()

################################################################################################
# Clears variables from list
# Usage:
#   xgboost_clear_vars(<variables_list>)
macro(xgboost_clear_vars)
  foreach(_var ${ARGN})
    unset(${_var})
  endforeach()
endmacro()

################################################################################################
# Removes duplicates from string
# Usage:
#   xgboost_string_unique(<string_variable>)
function(xgboost_string_unique __string)
  if(${__string})
    set(__list ${${__string}})
    separate_arguments(__list)
    list(REMOVE_DUPLICATES __list)
    foreach(__e ${__list})
      set(__str "${__str} ${__e}")
    endforeach()
    set(${__string} ${__str} PARENT_SCOPE)
  endif()
endfunction()

################################################################################################
# Prints list element per line
# Usage:
#   xgboost_print_list(<list>)
function(xgboost_print_list)
  foreach(e ${ARGN})
    message(STATUS ${e})
  endforeach()
endfunction()

################################################################################################
# Function merging lists of compiler flags to single string.
# Usage:
#   xgboost_merge_flag_lists(out_variable <list1> [<list2>] [<list3>] ...)
function(xgboost_merge_flag_lists out_var)
  set(__result "")
  foreach(__list ${ARGN})
    foreach(__flag ${${__list}})
      string(STRIP ${__flag} __flag)
      set(__result "${__result} ${__flag}")
    endforeach()
  endforeach()
  string(STRIP ${__result} __result)
  set(${out_var} ${__result} PARENT_SCOPE)
endfunction()

################################################################################################
# Converts all paths in list to absolute
# Usage:
#   xgboost_convert_absolute_paths(<list_variable>)
function(xgboost_convert_absolute_paths variable)
  set(__dlist "")
  foreach(__s ${${variable}})
    get_filename_component(__abspath ${__s} ABSOLUTE)
    list(APPEND __list ${__abspath})
  endforeach()
  set(${variable} ${__list} PARENT_SCOPE)
endfunction()

################################################################################################
# Reads set of version defines from the header file
# Usage:
#   xgboost_parse_header(<file> <define1> <define2> <define3> ..)
macro(xgboost_parse_header FILENAME FILE_VAR)
  set(vars_regex "")
  set(__parent_scope OFF)
  set(__add_cache OFF)
  foreach(name ${ARGN})
    if("${name}" STREQUAL "PARENT_SCOPE")
      set(__parent_scope ON)
    elseif("${name}" STREQUAL "CACHE")
      set(__add_cache ON)
    elseif(vars_regex)
      set(vars_regex "${vars_regex}|${name}")
    else()
      set(vars_regex "${name}")
    endif()
  endforeach()
  if(EXISTS "${FILENAME}")
    file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+")
  else()
    unset(${FILE_VAR})
  endif()
  foreach(name ${ARGN})
    if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
      if(${FILE_VAR})
        if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
          string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
        else()
          set(${name} "")
        endif()
        if(__add_cache)
          set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
        elseif(__parent_scope)
          set(${name} "${${name}}" PARENT_SCOPE)
        endif()
      else()
        unset(${name} CACHE)
      endif()
    endif()
  endforeach()
endmacro()

################################################################################################
# Reads single version define from the header file and parses it
# Usage:
#   xgboost_parse_header_single_define(<library_name> <file> <define_name>)
function(xgboost_parse_header_single_define LIBNAME HDR_PATH VARNAME)
  set(${LIBNAME}_H "")
  if(EXISTS "${HDR_PATH}")
    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
  endif()

  if(${LIBNAME}_H)
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}")
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
    set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE)

    # append a TWEAK version if it exists:
    set(${LIBNAME}_VERSION_TWEAK "")
    if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$")
      set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE)
    endif()
    if(${LIBNAME}_VERSION_TWEAK)
      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE)
    else()
      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE)
    endif()
  endif()
endfunction()

########################################################################################################
# An option that the user can select. Can accept condition to control when option is available for user.
# Usage:
#   xgboost_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
function(xgboost_option variable description value)
  set(__value ${value})
  set(__condition "")
  set(__varname "__value")
  foreach(arg ${ARGN})
    if(arg STREQUAL "IF" OR arg STREQUAL "if")
      set(__varname "__condition")
    else()
      list(APPEND ${__varname} ${arg})
    endif()
  endforeach()
  unset(__varname)
  if("${__condition}" STREQUAL "")
    set(__condition 2 GREATER 1)
  endif()

  if(${__condition})
    if("${__value}" MATCHES ";")
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    elseif(DEFINED ${__value})
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    else()
      option(${variable} "${description}" ${__value})
    endif()
  else()
    unset(${variable} CACHE)
  endif()
endfunction()

################################################################################################
# Utility macro for comparing two lists. Used for CMake debugging purposes
# Usage:
#   xgboost_compare_lists(<list_variable> <list2_variable> [description])
function(xgboost_compare_lists list1 list2 desc)
  set(__list1 ${${list1}})
  set(__list2 ${${list2}})
  list(SORT __list1)
  list(SORT __list2)
  list(LENGTH __list1 __len1)
  list(LENGTH __list2 __len2)

  if(NOT ${__len1} EQUAL ${__len2})
    message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}")
  endif()

  foreach(__i RANGE 1 ${__len1})
    math(EXPR __index "${__i} - 1")
    list(GET __list1 ${__index} __item1)
    list(GET __list2 ${__index} __item2)
    if(NOT ${__item1} STREQUAL ${__item2})
      message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. ${desc}")
    endif()
  endforeach()
endfunction()

################################################################################################
# Command for disabling warnings for different platforms (see below for gcc and VisualStudio)
# Usage:
#   xgboost_warnings_disable(<CMAKE_[C|CXX]_FLAGS[_CONFIGURATION]> -Wshadow /wd4996 ..,)
macro(xgboost_warnings_disable)
  set(_flag_vars "")
  set(_msvc_warnings "")
  set(_gxx_warnings "")

  foreach(arg ${ARGN})
    if(arg MATCHES "^CMAKE_")
      list(APPEND _flag_vars ${arg})
    elseif(arg MATCHES "^/wd")
      list(APPEND _msvc_warnings ${arg})
    elseif(arg MATCHES "^-W")
      list(APPEND _gxx_warnings ${arg})
    endif()
  endforeach()

  if(NOT _flag_vars)
    set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  endif()

  if(MSVC AND _msvc_warnings)
    foreach(var ${_flag_vars})
      foreach(warning ${_msvc_warnings})
        set(${var} "${${var}} ${warning}")
      endforeach()
    endforeach()
  elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings)
    foreach(var ${_flag_vars})
      foreach(warning ${_gxx_warnings})
        if(NOT warning MATCHES "^-Wno-")
          string(REPLACE "${warning}" "" ${var} "${${var}}")
          string(REPLACE "-W" "-Wno-" warning "${warning}")
        endif()
        set(${var} "${${var}} ${warning}")
      endforeach()
    endforeach()
  endif()
  xgboost_clear_vars(_flag_vars _msvc_warnings _gxx_warnings)
endmacro()

################################################################################################
# Helper function get current definitions
# Usage:
#   xgboost_get_current_definitions(<definitions_variable>)
function(xgboost_get_current_definitions definitions_var)
  get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS)
  set(result "")

  foreach(d ${current_definitions})
    list(APPEND result -D${d})
  endforeach()

  xgboost_list_unique(result)
  set(${definitions_var} ${result} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function get current includes/definitions
# Usage:
#   xgboost_get_current_cflags(<cflagslist_variable>)
function(xgboost_get_current_cflags cflags_var)
  get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
  xgboost_convert_absolute_paths(current_includes)
  xgboost_get_current_definitions(cflags)

  foreach(i ${current_includes})
    list(APPEND cflags "-I${i}")
  endforeach()

  xgboost_list_unique(cflags)
  set(${cflags_var} ${cflags} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function to parse current linker libs into link directories, libflags and osx frameworks
# Usage:
#   xgboost_parse_linker_libs(<xgboost_LINKER_LIBS_var> <directories_var> <libflags_var> <frameworks_var>)
function(xgboost_parse_linker_libs xgboost_LINKER_LIBS_variable folders_var flags_var frameworks_var)

  set(__unspec "")
  set(__debug "")
  set(__optimized "")
  set(__framework "")
  set(__varname "__unspec")

  # split libs into debug, optimized, unspecified and frameworks
  foreach(list_elem ${${xgboost_LINKER_LIBS_variable}})
    if(list_elem STREQUAL "debug")
      set(__varname "__debug")
    elseif(list_elem STREQUAL "optimized")
      set(__varname "__optimized")
    elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)")
      list(APPEND __framework -framework ${CMAKE_MATCH_1})
    else()
      list(APPEND ${__varname} ${list_elem})
      set(__varname "__unspec")
    endif()
  endforeach()

  # attach debug or optimized libs to unspecified according to current configuration
  if(CMAKE_BUILD_TYPE MATCHES "Debug")
    set(__libs ${__unspec} ${__debug})
  else()
    set(__libs ${__unspec} ${__optimized})
  endif()

  set(libflags "")
  set(folders "")

  # convert linker libraries list to link flags
  foreach(lib ${__libs})
    if(TARGET ${lib})
      list(APPEND folders $<TARGET_LINKER_FILE_DIR:${lib}>)
      list(APPEND libflags -l${lib})
    elseif(lib MATCHES "^-l.*")
      list(APPEND libflags ${lib})
    elseif(IS_ABSOLUTE ${lib})
      get_filename_component(name_we ${lib} NAME_WE)
      get_filename_component(folder ${lib} PATH)

      string(REGEX MATCH "^lib(.*)" __match ${name_we})
      list(APPEND libflags -l${CMAKE_MATCH_1})
      list(APPEND folders ${folder})
    else()
      message(FATAL_ERROR "Logic error. Need to update cmake script")
    endif()
  endforeach()

  xgboost_list_unique(libflags folders)

  set(${folders_var} ${folders} PARENT_SCOPE)
  set(${flags_var} ${libflags} PARENT_SCOPE)
  set(${frameworks_var} ${__framework} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, ....
# Usage:
#   xgboost_detect_darwin_version(<version_variable>)
function(xgboost_detect_darwin_version output_var)
  if(APPLE)
    execute_process(COMMAND /usr/bin/sw_vers -productVersion
                    RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    set(${output_var} ${__sw_vers_out} PARENT_SCOPE)
  else()
    set(${output_var} "" PARENT_SCOPE)
  endif()
endfunction()

################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
#   xgboost_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
function(xgboost_source_group group)
  cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(CAFFE_SOURCE_GROUP_GLOB)
    file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${srcs1})
  endif()

  if(CAFFE_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${srcs2})
  endif()
endfunction()
jvm-packages/xgboost4j/src/native/xgboost4j.cpp

@@ -24,7 +24,11 @@
 // helper functions
 // set handle
 void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
-  jlong out = (jlong) handle;
+#ifdef __APPLE__
+  jlong out = (long) handle;
+#else
+  int64_t out = (int64_t) handle;
+#endif
   jenv->SetLongArrayRegion(jhandle, 0, 1, &out);
 }

@@ -32,7 +36,7 @@ void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
 static JavaVM* global_jvm = nullptr;

 // overrides JNI on load
-JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
+jint JNI_OnLoad(JavaVM *vm, void *reserved) {
   global_jvm = vm;
   return JNI_VERSION_1_6;
 }

@@ -72,7 +76,7 @@ XGB_EXTERN_C int XGBoost4jCallbackDataIterNext(
       batch, jenv->GetFieldID(batchClass, "featureValue", "[F"));
   XGBoostBatchCSR cbatch;
   cbatch.size = jenv->GetArrayLength(joffset) - 1;
-  cbatch.offset = reinterpret_cast<jlong *>(
+  cbatch.offset = reinterpret_cast<long *>(
       jenv->GetLongArrayElements(joffset, 0));
   if (jlabel != nullptr) {
     cbatch.label = jenv->GetFloatArrayElements(jlabel, 0);
1	nccl (submodule)

@@ -0,0 +1 @@
+Subproject commit 93183bca921b2e8e1754e27e1b43d73cf6caec9d
@@ -17,8 +17,11 @@ colsample_bytree | ✔ | ✔ |
 colsample_bylevel | ✔ | ✔ |
 max_bin | ✖ | ✔ |
 gpu_id | ✔ | ✔ |
+n_gpus | ✖ | ✔ |

-All algorithms currently use only a single GPU. The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
+The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
+
+Multiple GPUs can be used with the grow_gpu_hist method via the 'n_gpus' parameter, which defaults to -1 (use all visible GPUs). If 'gpu_id' is non-zero, the devices used are (gpu_id + i) % n_visible_devices for i = 0 to n_gpus - 1. As with GPU vs. CPU, multi-GPU is not always faster than a single GPU, because PCI bus bandwidth can limit performance. For example, when n_features * n_bins * 2^depth divided by the time of each round/iteration becomes comparable to the real PCI 16x bus bandwidth of order 4GB/s to 10GB/s, AllReduce dominates the running time and additional GPUs stop improving performance. CPU overhead between GPU calls can also limit the usefulness of multiple GPUs.
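As an illustrative sketch (not plugin code), the device rotation described above amounts to:

```cpp
#include <vector>

// Illustrative only: the device list implied by the rule above,
// (gpu_id + i) % n_visible_devices for i = 0..n_gpus-1.
std::vector<int> device_list(int gpu_id, int n_gpus, int n_visible_devices) {
  if (n_gpus < 0) n_gpus = n_visible_devices;  // n_gpus = -1: use all GPUs
  std::vector<int> devices(n_gpus);
  for (int i = 0; i < n_gpus; ++i)
    devices[i] = (gpu_id + i) % n_visible_devices;
  return devices;
}
```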
 This plugin currently works with the CLI and Python versions.

@@ -54,29 +57,38 @@ $ python -m nose test/python/
 ## Dependencies
 A CUDA-capable GPU with compute capability 3.5 or higher (the algorithm depends on shuffle and vote instructions introduced in Kepler).

-Building the plug-in requires CUDA Toolkit 7.5 or later.
+Building the plug-in requires CUDA Toolkit 7.5 or later (https://developer.nvidia.com/cuda-downloads).

+submodule: The plugin also depends on CUB 1.6.4 - https://nvlabs.github.io/cub/ . CUB is a header-only CUDA library which provides sort/reduce/scan primitives.

+submodule: NVIDIA NCCL from https://github.com/NVIDIA/nccl, with a Windows port available via git@github.com:h2oai/nccl.git

 ## Build

 ### Using cmake
 To use the plugin, xgboost must be built by specifying the option PLUGIN_UPDATER_GPU=ON. CMake will prepare a build system depending on which platform you are on.
-From the command line on Linux starting from the xgboost directory:
+On Linux, from the xgboost directory:
```bash
$ mkdir build
$ cd build
$ cmake .. -DPLUGIN_UPDATER_GPU=ON
-$ make
+$ make -j
```
+If 'make' fails, try invoking make again; there can sometimes be problems with the order in which items are built.

-On Windows you may also need to specify your generator as 64 bit, so the cmake command becomes:
+On Windows, check which Generator options cmake offers and choose one with [arch] replaced by Win64:
```bash
-$ cmake .. -G"Visual Studio 12 2013 Win64" -DPLUGIN_UPDATER_GPU=ON
+cmake -help
```
-You may also be able to use a later version of visual studio depending on whether the CUDA toolkit supports it.
-cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode. This is also a good time to check it is being built as x64. If not make sure the cmake generator is set correctly.
+Then run cmake as:
```bash
$ mkdir build
$ cd build
$ cmake .. -G"Visual Studio 14 2015 Win64" -DPLUGIN_UPDATER_GPU=ON
```
+Cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode as an x64 build.

+Visual Studio Community 2015, supported by the CUDA toolkit (http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/#axzz4isREr2nS), can be downloaded from https://my.visualstudio.com/Downloads?q=Visual%20Studio%20Community%202015 . You may also be able to use a later version of Visual Studio depending on whether the CUDA toolkit supports it. Note that MinGW cannot be used with CUDA.

 ### For Developers!

 ### Using make
 Now, it also supports the usual 'make' flow to build gpu-enabled tree construction plugins. It's currently only tested on Linux. From the xgboost directory:
@@ -84,9 +96,6 @@ Now, it also supports the usual 'make' flow to build gpu-enabled tree constructi
```bash
# make sure CUDA SDK bin directory is in the 'PATH' env variable
$ make PLUGIN_UPDATER_GPU=ON
```

 ### For Developers!

 Now, some of the code-base inside gpu plugins has googletest unit-tests inside 'tests/'.
 They can be enabled and run along with the other unit-tests inside '<xgboostRoot>/tests/cpp' using:
```bash
@@ -98,10 +107,17 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
```

 ## Changelog
+##### 2017/6/5
+
+* Multi-GPU support for histogram method using NVIDIA NCCL.

 ##### 2017/5/31
 * Faster version of the grow_gpu plugin
 * Added support for building gpu plugin through 'make' flow too

 ##### 2017/5/19
 * Further performance enhancements for histogram method.

 ##### 2017/5/5
 * Histogram performance improvements
 * Fix gcc build issues
@@ -115,10 +131,19 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
 [Mitchell, Rory, and Eibe Frank. Accelerating the XGBoost algorithm using GPU computing. No. e2911v1. PeerJ Preprints, 2017.](https://peerj.com/preprints/2911/)

 ## Author
 Rory Mitchell
 Jonathan C. McKinney
 Shankara Rao Thejaswi Nanditale
 Vinay Deshpande
 ... and the rest of the H2O.ai and NVIDIA team.

+Please report bugs to the xgboost/issues page. You can tag me with @RAMitchell.
+
+Otherwise I can be contacted at r.a.mitchell.nz at gmail.
-Please report bugs to the xgboost/issues page.
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2016 Rory mitchell
+ * Copyright 2017 XGBoost contributors
 */
 #pragma once
 #include <vector>
@@ -147,7 +148,8 @@ inline void dense2sparse_tree(RegTree* p_tree,
 }

 // Set gradient pair to 0 with p = 1 - subsample
-inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
+inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample,
+                            int offset) {
   if (subsample == 1.0) {
     return;
   }
@@ -157,13 +158,19 @@ inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
   auto d_gpair = gpair.data();
   dh::BernoulliRng rng(subsample, common::GlobalRandom()());

-  dh::launch_n(gpair.size(), [=] __device__(int i) {
-    if (!rng(i)) {
+  dh::launch_n(gpair.device_idx(), gpair.size(), [=] __device__(int i) {
+    if (!rng(i + offset)) {
       d_gpair[i] = gpu_gpair();
     }
   });
 }

+// Set gradient pair to 0 with p = 1 - subsample
+inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
+  int offset = 0;
+  subsample_gpair(p_gpair, subsample, offset);
+}
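A hypothetical driver loop showing the intent of the new offset argument (the shard layout, `row_begin`, and include paths below are illustrative, not plugin code): each device owns a contiguous shard of rows, and passing the shard's global starting row means the RNG is indexed by the row's global index, so the subsampling mask matches what a single-GPU run would produce on the unsharded gradient vector.

```cpp
#include <vector>
#include "device_helpers.cuh"  // assumed header for dh::dvec
#include "types.cuh"           // assumed header for gpu_gpair

// Illustrative only: subsample every device shard with its global row offset.
void subsample_all_shards(std::vector<dh::dvec<gpu_gpair>>* shards,
                          const std::vector<int>& row_begin, float subsample) {
  for (size_t d = 0; d < shards->size(); ++d) {
    // offset = global index of the first row held by device d
    subsample_gpair(&(*shards)[d], subsample, row_begin[d]);
  }
}
```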
 inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
   int n = colsample * features.size();
   CHECK_GT(n, 0);
@@ -233,8 +240,8 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
  * @param def default value to be filled
  */
 template <typename T, int BlkDim=256, int ItemsPerThread=4>
-void fillConst(T* out, int len, T def) {
-  dh::launch_n<ItemsPerThread,BlkDim>(len, [=] __device__(int i) { out[i] = def; });
+void fillConst(int device_idx, T* out, int len, T def) {
+  dh::launch_n<ItemsPerThread,BlkDim>(device_idx, len, [=] __device__(int i) { out[i] = def; });
 }

 /**
@@ -247,10 +254,10 @@ void fillConst(T* out, int len, T def) {
  * @param nVals length of the buffers
  */
 template <typename T1, typename T2, int BlkDim=256, int ItemsPerThread=4>
-void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
+void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
             int nVals) {
   dh::launch_n<ItemsPerThread,BlkDim>
-    (nVals, [=] __device__(int i) {
+    (device_idx, nVals, [=] __device__(int i) {
       int iid = instId[i];
       T1 v1 = in1[iid];
       T2 v2 = in2[iid];
@@ -267,9 +274,9 @@ void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
  * @param nVals length of the buffers
  */
 template <typename T, int BlkDim=256, int ItemsPerThread=4>
-void gather(T* out, const T* in, const int* instId, int nVals) {
+void gather(int device_idx, T* out, const T* in, const int* instId, int nVals) {
   dh::launch_n<ItemsPerThread,BlkDim>
-    (nVals, [=] __device__(int i) {
+    (device_idx, nVals, [=] __device__(int i) {
       int iid = instId[i];
       out[i] = in[iid];
     });
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2016 Rory mitchell
+ * Copyright 2017 XGBoost contributors
 */
 #pragma once
 #include <thrust/device_vector.h>
@@ -12,11 +12,20 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <numeric>
 #include <cub/cub.cuh>

+#ifndef NCCL
+#define NCCL 1
+#endif
+
+#if (NCCL)
+#include "nccl.h"
+#endif
+
 // Uncomment to enable
 // #define DEVICE_TIMER
-// #define TIMERS
+#define TIMERS

 namespace dh {

@@ -42,6 +51,22 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
   return code;
 }

+#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)
+
+#if (NCCL)
+inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
+                                        int line) {
+  if (code != ncclSuccess) {
+    std::stringstream ss;
+    ss << "NCCL failure :" << ncclGetErrorString(code) << " ";
+    ss << file << "(" << line << ")";
+    throw std::runtime_error(ss.str());
+  }
+
+  return code;
+}
+#endif
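A sketch of the intended usage, assuming the standard NCCL API (`ncclCommInitAll`, `ncclCommDestroy`); the `using` declaration is needed because the macro expands to an unqualified call into namespace `dh`:

```cpp
#include <nccl.h>
#include "device_helpers.cuh"  // assumed location of dh::throw_on_nccl_error

using dh::throw_on_nccl_error;  // safe_nccl expands to an unqualified call

int main() {
  // Each NCCL call is wrapped so a failure surfaces as a std::runtime_error
  // carrying ncclGetErrorString(code) plus file/line context.
  ncclComm_t comms[2];
  int devs[2] = {0, 1};                        // hypothetical two-GPU box
  safe_nccl(ncclCommInitAll(comms, 2, devs));  // one communicator per GPU
  for (int i = 0; i < 2; ++i) safe_nccl(ncclCommDestroy(comms[i]));
  return 0;
}
```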
 #define gpuErrchk(ans) \
   { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line,
@@ -53,6 +78,55 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
   }
 }

+inline int n_visible_devices() {
+  int n_visgpus = 0;
+
+  cudaGetDeviceCount(&n_visgpus);
+
+  return n_visgpus;
+}
+
+inline int n_devices_all(int n_gpus) {
+  if ((NCCL == 0 && n_gpus > 1) || (NCCL == 0 && n_gpus != 0)) {
+    if (n_gpus != 1 && n_gpus != 0) {
+      fprintf(stderr, "NCCL=0, so forcing n_gpus=1\n");
+      fflush(stderr);
+    }
+    n_gpus = 1;
+  }
+  int n_devices_visible = dh::n_visible_devices();
+  int n_devices = n_gpus < 0 ? n_devices_visible : n_gpus;
+  return (n_devices);
+}
+inline int n_devices(int n_gpus, int num_rows) {
+  int n_devices = dh::n_devices_all(n_gpus);
+  // fix up the device count so it is limited by the number of rows
+  n_devices = n_devices > num_rows ? num_rows : n_devices;
+  return (n_devices);
+}
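Illustrative only: with a hypothetical machine exposing 4 GPUs, these helpers resolve the user-facing n_gpus parameter as follows.

```cpp
#include "device_helpers.cuh"  // assumed header providing dh::n_devices

int main() {
  // With 4 visible GPUs (hypothetical), dh::n_devices resolves n_gpus so:
  //   dh::n_devices(-1, 1000000) == 4  // n_gpus = -1 means all visible GPUs
  //   dh::n_devices( 2, 1000000) == 2  // explicit request honoured
  //   dh::n_devices(-1, 3)       == 3  // never more devices than rows
  return dh::n_devices(-1, 1000000) > 0 ? 0 : 1;
}
```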
+// if n_devices=-1, then use all visible devices
+inline void synchronize_n_devices(int n_devices, std::vector<int> dList) {
+  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
+    int device_idx = dList[d_idx];
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaDeviceSynchronize());
+  }
+}
+inline void synchronize_all() {
+  for (int device_idx = 0; device_idx < n_visible_devices(); device_idx++) {
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaDeviceSynchronize());
+  }
+}
+
+inline std::string device_name(int device_idx) {
+  cudaDeviceProp prop;
+  dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
+  return std::string(prop.name);
+}
 /*
  * Timers
  */
@@ -119,7 +193,9 @@ struct DeviceTimer {

 #ifdef DEVICE_TIMER
   __device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot)  // NOLINT
-      : GTimer(GTimer), start(clock()), slot(slot) {}
+      : GTimer(GTimer),
+        start(clock()),
+        slot(slot) {}
 #else
   __device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot) {}  // NOLINT
 #endif
@@ -146,8 +222,8 @@ struct Timer {
   void reset() { start = ClockT::now(); }
   int64_t elapsed() const { return (ClockT::now() - start).count(); }
   void printElapsed(std::string label) {
-    safe_cuda(cudaDeviceSynchronize());
-    printf("%s:\t %lld\n", label.c_str(), (long long)elapsed());
+    // synchronize_n_devices(n_devices, dList);
+    printf("%s:\t %lld\n", label.c_str(), elapsed());
     reset();
   }
 };
@@ -229,43 +305,47 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
  * Memory
  */

+enum memory_type { DEVICE, DEVICE_MANAGED };
+
+template <memory_type MemoryT>
 class bulk_allocator;
+template <typename T> class dvec2;

 template <typename T>
 class dvec {
   friend bulk_allocator;
+  friend class dvec2<T>;

 private:
   T *_ptr;
   size_t _size;
+  int _device_idx;

-  void external_allocate(void *ptr, size_t size) {
+ public:
+  void external_allocate(int device_idx, void *ptr, size_t size) {
     if (!empty()) {
       throw std::runtime_error("Tried to allocate dvec but already allocated");
     }
     _ptr = static_cast<T *>(ptr);
     _size = size;
+    _device_idx = device_idx;
   }

- public:
-  dvec() : _ptr(NULL), _size(0) {}
+  dvec() : _ptr(NULL), _size(0), _device_idx(0) {}
   size_t size() const { return _size; }
+  int device_idx() const { return _device_idx; }
   bool empty() const { return _ptr == NULL || _size == 0; }

   T *data() { return _ptr; }

   std::vector<T> as_vector() const {
     std::vector<T> h_vector(size());
+    safe_cuda(cudaSetDevice(_device_idx));
     safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
     return h_vector;
   }

   void fill(T value) {
+    safe_cuda(cudaSetDevice(_device_idx));
     thrust::fill_n(thrust::device_pointer_cast(_ptr), size(), value);
   }

@@ -285,11 +365,7 @@ class dvec {

   template <typename T2>
   dvec &operator=(const std::vector<T2> &other) {
-    if (other.size() != size()) {
-      throw std::runtime_error(
-          "Cannot copy assign vector to dvec, sizes are different");
-    }
-    thrust::copy(other.begin(), other.end(), this->tbegin());
+    this->copy(other.begin(), other.end());
     return *this;
   }

@@ -298,9 +374,25 @@
       throw std::runtime_error(
           "Cannot copy assign dvec to dvec, sizes are different");
     }
+    safe_cuda(cudaSetDevice(this->device_idx()));
+    if (other.device_idx() == this->device_idx()) {
       thrust::copy(other.tbegin(), other.tend(), this->tbegin());
+    } else {
+      throw std::runtime_error("Cannot copy to/from different devices");
+    }
+
     return *this;
   }
+
+  template <typename IterT>
+  void copy(IterT begin, IterT end) {
+    safe_cuda(cudaSetDevice(this->device_idx()));
+    if (end - begin != size()) {
+      throw std::runtime_error(
+          "Cannot copy assign vector to dvec, sizes are different");
+    }
+    thrust::copy(begin, end, this->tbegin());
+  }
 };
 /**
@@ -309,34 +401,34 @@
  */
 template <typename T>
 class dvec2 {
   friend bulk_allocator;

 private:
   dvec<T> _d1, _d2;
   cub::DoubleBuffer<T> _buff;
+  int _device_idx;

-  void external_allocate(void *ptr1, void *ptr2, size_t size) {
+ public:
+  void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
     if (!empty()) {
       throw std::runtime_error("Tried to allocate dvec2 but already allocated");
     }
-    _d1.external_allocate(ptr1, size);
-    _d2.external_allocate(ptr2, size);
+    _d1.external_allocate(_device_idx, ptr1, size);
+    _d2.external_allocate(_device_idx, ptr2, size);
     _buff.d_buffers[0] = static_cast<T *>(ptr1);
     _buff.d_buffers[1] = static_cast<T *>(ptr2);
     _buff.selector = 0;
+    _device_idx = device_idx;
   }

- public:
-  dvec2() : _d1(), _d2(), _buff() {}
+  dvec2() : _d1(), _d2(), _buff(), _device_idx(0) {}

   size_t size() const { return _d1.size(); }
+  int device_idx() const { return _device_idx; }
   bool empty() const { return _d1.empty() || _d2.empty(); }

   cub::DoubleBuffer<T> &buff() { return _buff; }

   dvec<T> &d1() { return _d1; }

   dvec<T> &d2() { return _d2; }

   T *current() { return _buff.Current(); }
@@ -346,9 +438,11 @@ class dvec2 {
   T *other() { return _buff.Alternate(); }
 };

+template <memory_type MemoryT>
 class bulk_allocator {
-  char *d_ptr;
-  size_t _size;
+  std::vector<char *> d_ptr;
+  std::vector<size_t> _size;
+  std::vector<int> _device_idx;

   const int align = 256;

@@ -369,18 +463,32 @@
   }

   template <typename T, typename SizeT>
-  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size) {
-    first_vec->external_allocate(static_cast<void *>(ptr), first_size);
+  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
+                     SizeT first_size) {
+    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
+                                 first_size);
   }

   template <typename T, typename SizeT, typename... Args>
-  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
-                     Args... args) {
-    allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
+  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
+                     SizeT first_size, Args... args) {
+    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
+                                 first_size);
     ptr += align_round_up(first_size * sizeof(T));
-    allocate_dvec(ptr, args...);
+    allocate_dvec(device_idx, ptr, args...);
   }

+  // template <memory_type MemoryT>
+  char *allocate_device(int device_idx, size_t bytes, memory_type t) {
+    char *ptr;
+    if (t == memory_type::DEVICE) {
+      safe_cuda(cudaSetDevice(device_idx));
+      safe_cuda(cudaMalloc(&ptr, bytes));
+    } else {
+      safe_cuda(cudaMallocManaged(&ptr, bytes));
+    }
+    return ptr;
+  }
   template <typename T, typename SizeT>
   size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
     return 2 * align_round_up(first_size * sizeof(T));
@@ -392,40 +500,46 @@
   }

   template <typename T, typename SizeT>
-  void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size) {
-    first_vec->external_allocate
-      (static_cast<void *>(ptr),
+  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size) {
+    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
       static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
       first_size);
   }

   template <typename T, typename SizeT, typename... Args>
-  void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size,
+  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size,
                      Args... args) {
-    allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
+    allocate_dvec<T,SizeT>(device_idx, ptr, first_vec, first_size);
     ptr += (align_round_up(first_size * sizeof(T)) * 2);
-    allocate_dvec(ptr, args...);
+    allocate_dvec(device_idx, ptr, args...);
   }

 public:
   bulk_allocator() : _size(0), d_ptr(NULL) {}

   ~bulk_allocator() {
-    if (!(d_ptr == nullptr)) {
-      safe_cuda(cudaFree(d_ptr));
+    for (int i = 0; i < d_ptr.size(); i++) {
+      if (!(d_ptr[i] == nullptr)) {
+        safe_cuda(cudaSetDevice(_device_idx[i]));
+        safe_cuda(cudaFree(d_ptr[i]));
+      }
     }
   }

-  size_t size() { return _size; }
+  // returns sum of bytes for all allocations
+  size_t size() {
+    return std::accumulate(_size.begin(), _size.end(), static_cast<size_t>(0));
+  }

   template <typename... Args>
-  void allocate(Args... args) {
-    if (d_ptr != NULL) {
-      throw std::runtime_error("Bulk allocator already allocated");
-    }
-    _size = get_size_bytes(args...);
-    safe_cuda(cudaMalloc(&d_ptr, _size));
-    allocate_dvec(d_ptr, args...);
+  void allocate(int device_idx, Args... args) {
+    size_t size = get_size_bytes(args...);
+
+    char *ptr = allocate_device(device_idx, size, MemoryT);
+
+    allocate_dvec(device_idx, ptr, args...);
+
+    d_ptr.push_back(ptr);
+    _size.push_back(size);
+    _device_idx.push_back(device_idx);
   }
 };
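A sketch of how the reworked allocator is meant to be used across devices, grounded in the new `allocate(device_idx, ...)` signature above (variable names and sizes are illustrative, not plugin code):

```cpp
#include "device_helpers.cuh"  // assumed header for dh::bulk_allocator, dh::dvec

void allocate_per_device_example() {
  // One bulk allocation per device; gpair0/gpair1 and n are placeholders.
  dh::bulk_allocator<dh::memory_type::DEVICE> ba;
  dh::dvec<float> gpair0, gpair1;
  size_t n = 1 << 20;
  ba.allocate(/*device_idx=*/0, &gpair0, n);  // backing memory on GPU 0
  ba.allocate(/*device_idx=*/1, &gpair1, n);  // backing memory on GPU 1
  // ba.size() reports the sum of both allocations; the destructor sets the
  // recorded device before freeing each pointer.
}
```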
|
||||
@ -455,19 +569,14 @@ struct CubMemory {
|
||||
bool IsAllocated() { return d_temp_storage != NULL; }
|
||||
};
|
||||
|
||||
inline size_t available_memory() {
|
||||
inline size_t available_memory(int device_idx) {
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
safe_cuda(cudaMemGetInfo(&device_free, &device_total));
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
|
||||
return device_free;
|
||||
}
|
||||
|
||||
inline std::string device_name() {
|
||||
cudaDeviceProp prop;
|
||||
safe_cuda(cudaGetDeviceProperties(&prop, 0));
|
||||
return std::string(prop.name);
|
||||
}

/*
* Utility functions
*/
@ -481,7 +590,7 @@ void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
std::cout << "\n";
}

template <typename T>
template <typename T, memory_type MemoryT>
void print(const dvec<T> &v, size_t max_items = 10) {
std::vector<T> h = v.as_vector();
for (int i = 0; i < std::min(max_items, h.size()); i++) {
@ -530,17 +639,46 @@ size_t size_bytes(const thrust::device_vector<T> &v) {
*/

template <typename L>
__global__ void launch_n_kernel(size_t n, L lambda) {
for (auto i : grid_stride_range(static_cast<size_t>(0), n)) {
__global__ void launch_n_kernel(size_t begin, size_t end, L lambda) {
for (auto i : grid_stride_range(begin, end)) {
lambda(i);
}
}
template <typename L>
__global__ void launch_n_kernel(int device_idx, size_t begin, size_t end,
L lambda) {
for (auto i : grid_stride_range(begin, end)) {
lambda(i, device_idx);
}
}

template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void launch_n(size_t n, L lambda) {
inline void launch_n(int device_idx, size_t n, L lambda) {
safe_cuda(cudaSetDevice(device_idx));
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
#if defined(__CUDACC__)
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
lambda);
#endif
}
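For illustration, a hedged sketch of calling launch_n with a device lambda, mirroring the node_id fill later in this diff. The buffer is made up, and --expt-extended-lambda from the CMake flags is required:

// Sketch: double every element of a device buffer on GPU 0.
inline void example_launch(float *d_out, size_t n) {  // d_out: cudaMalloc'd
  launch_n(0, n, [=] __device__(size_t i) {
    d_out[i] = d_out[i] * 2.0f;  // i covers [0, n) in grid strides
  });
}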

// if n_devices=-1, then use all visible devices
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void multi_launch_n(size_t n, int n_devices, L lambda) {
n_devices = n_devices < 0 ? n_visible_devices() : n_devices;
CHECK_LE(n_devices, n_visible_devices()) << "Number of devices requested "
"must be less than or equal to "
"the number of visible devices.";
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
#if defined(__CUDACC__)
n_devices = n_devices > n ? n : n_devices;
for (int device_idx = 0; device_idx < n_devices; device_idx++) {
safe_cuda(cudaSetDevice(device_idx));
size_t begin = (n / n_devices) * device_idx;
// the last device takes the remainder so the full range [0, n) is covered
size_t end = device_idx == n_devices - 1 ? n : (n / n_devices) * (device_idx + 1);
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(device_idx, begin, end,
lambda);
}
#endif
}
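A hedged usage sketch: the range [0, n) is split into contiguous per-device slices and the lambda also receives the device it runs on. A managed (cudaMallocManaged) buffer is assumed so every GPU can touch its slice:

// Sketch: square every element, work striped across all visible GPUs.
inline void example_multi_launch(float *data, size_t n) {  // managed memory
  multi_launch_n(n, -1, [=] __device__(size_t i, int device_idx) {
    data[i] = data[i] * data[i];  // i stays inside this device's slice
  });
}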


@ -168,7 +168,7 @@ void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
node_id_t nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
switch(algo) {
case ABK_GMEM:

@ -208,7 +208,7 @@ private:
dh::dvec<gpu_gpair> tmpScanGradBuff;
dh::dvec<int> tmpScanKeyBuff;
dh::dvec<int> colIds;
dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;

void findSplit(int level, node_id_t nodeStart, int nNodes) {
reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
@ -226,7 +226,8 @@ private:

void allocateAllData(int offsetSize) {
int tmpBuffSize = scanTempBufferSize(nVals);
ba.allocate(&vals, nVals,
ba.allocate(param.gpu_id,
&vals, nVals,
&vals_cached, nVals,
&instIds, nVals,
&instIds_cached, nVals,
@ -245,7 +246,7 @@ private:
}

void setupOneTimeData(DMatrix& hMat) {
size_t free_memory = dh::available_memory();
size_t free_memory = dh::available_memory(param.gpu_id);
if (!hMat.SingleColBlock()) {
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
}
@ -258,7 +259,7 @@ private:
if (!param.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
}
}

@ -340,7 +341,7 @@ private:
colOffsets.data(), vals.current(),
nVals, nCols);
// gather the node assignments across all other columns too
gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
instIds.current(), nVals);
sortKeys(level);
}
@ -351,7 +352,7 @@ private:
// but we don't need more than level+1 bits for sorting!
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
0, level+1);
gather<float,int>(vals.other(), vals.current(), instIds.other(),
gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
instIds.current(), nodeLocations.current(), nVals);
vals.buff().selector ^= 1;
instIds.buff().selector ^= 1;

@ -2,14 +2,10 @@
* Copyright 2016 Rory Mitchell
*/
#pragma once
#include "types.cuh"
#include "../../../src/tree/param.h"
#include "../../../src/common/random.h"

#include "../../../src/tree/param.h"
#include "types.cuh"

namespace xgboost {
namespace tree {


} // namespace tree
namespace tree {} // namespace tree
} // namespace xgboost

@ -21,7 +21,8 @@ struct GPUData {
int n_features;
int n_instances;

dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
// dh::bulk_allocator<int> ba;
GPUTrainingParam param;

dh::dvec<float> fvalues;
@ -72,24 +73,25 @@ struct GPUData {
n_features, foffsets.data(), foffsets.data() + 1);

// Allocate memory
size_t free_memory = dh::available_memory();
ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
&fvalues_cached, in_fvalues.size(), &foffsets,
size_t free_memory = dh::available_memory(param_in.gpu_id);
ba.allocate(param_in.gpu_id,
&fvalues, in_fvalues.size(), &fvalues_temp,
in_fvalues.size(), &fvalues_cached, in_fvalues.size(), &foffsets,
in_foffsets.size(), &instance_id, in_instance_id.size(),
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
in_instance_id.size(), &feature_id, in_feature_id.size(),
&node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
&node_id_instance, n_instances, &gpair, n_instances, &nodes,
max_nodes, &split_candidates, max_nodes_level * n_features,
&node_sums, max_nodes_level * n_features, &node_offsets,
max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
&sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size,
&feature_flags, n_features, &feature_set, n_features);
in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id,
in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance,
n_instances, &gpair, n_instances, &nodes, max_nodes, &split_candidates,
max_nodes_level * n_features, &node_sums, max_nodes_level * n_features,
&node_offsets, max_nodes_level * n_features, &sort_index_in,
in_fvalues.size(), &sort_index_out, in_fvalues.size(), &cub_mem,
cub_mem_size, &feature_flags, n_features, &feature_set, n_features);

if (!param_in.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on "
<< dh::device_name(param_in.gpu_id);
}

fvalues_cached = in_fvalues;
@ -134,7 +136,8 @@ struct GPUData {
auto d_node_id_instance = node_id_instance.data();
auto d_instance_id = instance_id.data();

dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
dh::launch_n(node_id.device_idx(), fvalues.size(),
[=] __device__(bst_uint i) {
d_node_id[i] = d_node_id_instance[d_instance_id[i]];
});
}

File diff suppressed because it is too large
@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>
@ -11,6 +11,14 @@
#include "device_helpers.cuh"
#include "types.cuh"

#ifndef NCCL
#define NCCL 1
#endif

#if (NCCL)
#include "nccl.h"
#endif

namespace xgboost {

namespace tree {
@ -18,7 +26,8 @@ namespace tree {
struct DeviceGMat {
dh::dvec<int> gidx;
dh::dvec<int> ridx;
void Init(const common::GHistIndexMatrix &gmat);
void Init(int device_idx, const common::GHistIndexMatrix &gmat,
bst_uint begin, bst_uint end);
};

struct HistBuilder {
@ -31,11 +40,11 @@ struct HistBuilder {

struct DeviceHist {
int n_bins;
dh::dvec<gpu_gpair> hist;
dh::dvec<gpu_gpair> data;

void Init(int max_depth);

void Reset();
void Reset(int device_idx);

HistBuilder GetBuilder();

@ -64,7 +73,9 @@ class GPUHistBuilder {
void FindSplit(int depth);
template <int BLOCK_THREADS>
void FindSplitSpecialize(int depth);
void InitFirstNode();
template <int BLOCK_THREADS>
void LaunchFindSplit(int depth);
void InitFirstNode(const std::vector<bst_gpair> &gpair);
void UpdatePosition(int depth);
void UpdatePositionDense(int depth);
void UpdatePositionSparse(int depth);
@ -80,32 +91,48 @@ class GPUHistBuilder {
MetaInfo *info;
bool initialised;
bool is_dense;
DeviceGMat device_matrix;
const DMatrix *p_last_fmat_;

dh::bulk_allocator ba;
dh::CubMemory cub_mem;
dh::dvec<int> gidx_feature_map;
dh::dvec<int> hist_node_segments;
dh::dvec<int> feature_segments;
dh::dvec<float> gain;
dh::dvec<NodeIdT> position;
dh::dvec<NodeIdT> position_tmp;
dh::dvec<float> gidx_fvalue_map;
dh::dvec<float> fidx_min_map;
DeviceHist hist;
dh::dvec<cub::KeyValuePair<int, float>> argmax;
dh::dvec<gpu_gpair> node_sums;
dh::dvec<gpu_gpair> hist_scan;
dh::dvec<gpu_gpair> device_gpair;
dh::dvec<Node> nodes;
dh::dvec<int> feature_flags;
dh::dvec<bool> left_child_smallest;
dh::dvec<bst_float> prediction_cache;
bool prediction_cache_initialised;

// choose which memory type to use (DEVICE or DEVICE_MANAGED)
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
// dh::bulk_allocator<dh::memory_type::DEVICE_MANAGED> ba; // can't be used
// with NCCL
dh::CubMemory cub_mem;

std::vector<int> feature_set_tree;
std::vector<int> feature_set_level;

bst_uint num_rows;
int n_devices;

// the vectors below hold one entry per device in use
std::vector<int> dList;
std::vector<int> device_row_segments;
std::vector<int> device_element_segments;

std::vector<DeviceHist> hist_vec;
std::vector<dh::dvec<Node>> nodes;
std::vector<dh::dvec<Node>> nodes_temp;
std::vector<dh::dvec<Node>> nodes_child_temp;
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
std::vector<dh::dvec<float>> fidx_min_map;
std::vector<dh::dvec<int>> feature_segments;
std::vector<dh::dvec<bst_float>> prediction_cache;
std::vector<dh::dvec<NodeIdT>> position;
std::vector<dh::dvec<NodeIdT>> position_tmp;
std::vector<DeviceGMat> device_matrix;
std::vector<dh::dvec<gpu_gpair>> device_gpair;
std::vector<dh::dvec<int>> gidx_feature_map;
std::vector<dh::dvec<float>> gidx_fvalue_map;

std::vector<cudaStream_t *> streams;
#if (NCCL)
std::vector<ncclComm_t> comms;
std::vector<std::vector<ncclComm_t>> find_split_comms;
#endif
};
} // namespace tree
} // namespace xgboost
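The comms/streams members above carry the NCCL state behind the multi-GPU histogram. A hedged sketch of the reduction pattern they enable; the buffer layout, the flat stream vector, and the helper name are assumptions, only the NCCL and CUDA calls are real API:

#include <vector>
#include <cuda_runtime.h>
#include "nccl.h"

// Sketch: sum per-device partial histograms so every GPU sees the total.
// hist_data[d] holds `count` floats on device d; comms and streams are
// one per device, as in the members declared above.
inline void all_reduce_hist(std::vector<ncclComm_t> &comms,
                            std::vector<cudaStream_t> &streams,
                            std::vector<float *> &hist_data, size_t count) {
  for (size_t d = 0; d < comms.size(); ++d) {
    ncclAllReduce(hist_data[d], hist_data[d], count, ncclFloat, ncclSum,
                  comms[d], streams[d]);  // in-place allreduce
  }
  for (size_t d = 0; d < streams.size(); ++d) {
    cudaSetDevice(static_cast<int>(d));
    cudaStreamSynchronize(streams[d]);    // wait for the reduction
  }
}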

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#include <xgboost/tree_updater.h>
#include <vector>

@ -1,3 +1,4 @@
from __future__ import print_function
#pylint: skip-file
import sys
sys.path.append("../../tests/python")
@ -12,6 +13,10 @@ dpath = '../../demo/data/'
ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)
    print(*args, file=sys.stdout, **kwargs)


class TestGPU(unittest.TestCase):
def test_grow_gpu(self):
@ -58,7 +63,7 @@ class TestGPU(unittest.TestCase):
'max_depth': 3,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')],
xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert self.non_decreasing(res['test']['auc'])
@ -74,13 +79,13 @@ class TestGPU(unittest.TestCase):
'max_depth': 2,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)

assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=10, replace=False):
for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 2

dtrain3 = xgb.DMatrix(X2, label=y2)
@ -92,17 +97,18 @@ class TestGPU(unittest.TestCase):
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=10, replace=False):
for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 3

dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85


def test_grow_gpu_hist(self):
n_gpus = -1
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
try:
@ -110,31 +116,59 @@ class TestGPU(unittest.TestCase):
except:
from sklearn.cross_validation import train_test_split

for max_depth in range(3, 10):  # TODO: doesn't work with 2 for some tests
#eprint("max_depth=%d" % (max_depth))

for max_bin_i in range(3, 11):
max_bin = np.power(2, max_bin_i)
#eprint("max_bin=%d" % (max_bin))

# regression test --- hist must be same as exact on all-categorical data
ag_param = {'max_depth': 2,
ag_param = {'max_depth': max_depth,
'tree_method': 'exact',
'nthread': 1,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': 2,
ag_param2 = {'max_depth': max_depth,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': 1,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_param3 = {'max_depth': max_depth,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': n_gpus,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_res = {}
ag_res2 = {}
ag_res3 = {}

num_rounds = 10
#eprint("normal updater")
xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res)
#eprint("grow_gpu_hist updater 1 gpu")
xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res2)
#eprint("grow_gpu_hist updater %d gpus" % (n_gpus))
xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res3)
assert ag_res['train']['auc'] == ag_res2['train']['auc']
assert ag_res['test']['auc'] == ag_res2['test']['auc']
assert ag_res['test']['auc'] == ag_res3['test']['auc']

######################################################################
digits = load_digits(2)
X = digits['data']
y = digits['target']
@ -144,14 +178,32 @@ class TestGPU(unittest.TestCase):

param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'max_depth': max_depth,
'n_gpus': 1,
'max_bin': max_bin,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')],
#eprint("digits: grow_gpu_hist updater 1 gpu")
xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert self.non_decreasing(res['test']['auc'])
#assert self.non_decreasing(res['test']['auc'])
param2 = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
'eval_metric': 'auc'}
res2 = {}
#eprint("digits: grow_gpu_hist updater %d gpus" % (n_gpus))
xgb.train(param2, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res2)
assert self.non_decreasing(res2['train']['auc'])
#assert self.non_decreasing(res2['test']['auc'])
assert res['train']['auc'] == res2['train']['auc']
#assert res['test']['auc'] == res2['test']['auc']

######################################################################
# fail-safe test for dense data
from sklearn.datasets import load_svmlight_file
X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train')
@ -160,16 +212,19 @@ class TestGPU(unittest.TestCase):

param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 2,
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)

assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=10, replace=False):
for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 2

dtrain3 = xgb.DMatrix(X2, label=y2)
@ -178,54 +233,63 @@ class TestGPU(unittest.TestCase):
xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res)

assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=10, replace=False):
for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 3

dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

######################################################################
# fail-safe test for max_bin
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'max_bin': max_bin}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# subsampling
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'subsample': 0.5,
'max_bin': max_bin}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# fail-safe test for max_bin=2
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 2,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'max_bin': 2}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

# subsampling
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc',
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'subsample': 0.5
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

# max_bin = 2048
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc',
'max_bin': 2048
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

def non_decreasing(self, L):
return all((x - y) < 0.001 for x, y in zip(L, L[1:]))

@ -81,6 +81,8 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
std::vector<int> monotone_constraints;
// gpu to use for single gpu algorithms
int gpu_id;
// number of GPUs to use
int n_gpus;
// declare the parameters
DMLC_DECLARE_PARAMETER(TrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
@ -192,6 +194,10 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
.set_lower_bound(0)
.set_default(0)
.describe("gpu to use for single gpu algorithms");
DMLC_DECLARE_FIELD(n_gpus)
.set_lower_bound(-1)
.set_default(-1)
.describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs");
// add alias of parameters
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
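A hedged sketch of how these two parameters plausibly combine into a device list; the wrap-around from gpu_id is an assumption based on the dList member of the histogram builder, not confirmed by the lines shown here:

#include <algorithm>
#include <vector>
#include <cuda_runtime.h>

// Sketch: resolve (gpu_id, n_gpus) into concrete CUDA device ordinals.
std::vector<int> make_device_list(int gpu_id, int n_gpus) {
  int n_visible = 0;
  safe_cuda(cudaGetDeviceCount(&n_visible));
  int n = n_gpus < 0 ? n_visible : std::min(n_gpus, n_visible);
  std::vector<int> dList(n);
  for (int i = 0; i < n; ++i) {
    dList[i] = (gpu_id + i) % n_visible;  // start at gpu_id, wrap around
  }
  return dList;
}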