[GPU-Plugin] Change GPU plugin to use tree_method parameter, bump cmake version to 3.5 for GPU plugin, add compute architecture 3.5, remove unused cmake files (#2455)

2017-06-29 16:19:45 +12:00 · 2017-06-29 16:19:45 +12:00 · 48f3003302
commit 48f3003302
parent 88488fdbb9
8 changed files with 168 additions and 835 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -3,9 +3,10 @@ project (xgboost)
 find_package(OpenMP)

 option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")
-set(GPU_COMPUTE_VER 50;52;60;61 CACHE STRING
+set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
    "Space separated list of compute versions to be built against")
 if(PLUGIN_UPDATER_GPU)
+  cmake_minimum_required (VERSION 3.5)
  find_package(CUDA REQUIRED)
 endif()

--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@ -1,289 +0,0 @@
-
-include(CheckCXXCompilerFlag)
-check_cxx_compiler_flag("-std=c++11"   SUPPORT_CXX11)
-
-################################################################################################
-# A function for automatic detection of GPUs installed  (if autodetection is enabled)
-# Usage:
-#   mshadow_detect_installed_gpus(out_variable)
-function(xgboost_detect_installed_gpus out_variable)
-set(CUDA_gpu_detect_output "")
-  if(NOT CUDA_gpu_detect_output)
-    set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
-
-    file(WRITE ${__cufile} ""
-      "#include <cstdio>\n"
-      "int main()\n"
-      "{\n"
-      "  int count = 0;\n"
-      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
-      "  if (count == 0) return -1;\n"
-      "  for (int device = 0; device < count; ++device)\n"
-      "  {\n"
-      "    cudaDeviceProp prop;\n"
-      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
-      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
-      "  }\n"
-      "  return 0;\n"
-      "}\n")
-    if(MSVC)
-      #find vcvarsall.bat and run it building msvc environment
-      get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
-      find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
-      execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run  ${__cufile}
-                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
-                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
-                      ERROR_QUIET
-                      OUTPUT_STRIP_TRAILING_WHITESPACE)
-    else()
-      if(CUDA_LIBRARY_PATH)
-        set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
-      endif()
-      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
-                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
-                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
-                      ERROR_QUIET
-                      OUTPUT_STRIP_TRAILING_WHITESPACE)
-    endif()
-    if(__nvcc_res EQUAL 0)
-      # nvcc outputs text containing line breaks when building with MSVC.
-      # The line below prevents CMake from inserting a variable with line
-      # breaks in the cache
-      string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}")
-      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
-      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from xgboost_detect_gpus tool" FORCE)
-    else()
-      message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}")
-    endif()
-  endif()
-
-  if(NOT CUDA_gpu_detect_output)
-    message(WARNING "Automatic GPU detection failed. Building for all known architectures (${xgboost_known_gpu_archs}).")
-    set(${out_variable} ${xgboost_known_gpu_archs} PARENT_SCOPE)
-  else()
-    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
-  endif()
-endfunction()
-
-
-################################################################################################
-# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
-# Usage:
-#   xgboost_select_nvcc_arch_flags(out_variable)
-function(xgboost_select_nvcc_arch_flags out_variable)
-  # List of arch names
-  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual")
-  set(__archs_name_default "All")
-  if(NOT CMAKE_CROSSCOMPILING)
-    list(APPEND __archs_names "Auto")
-    set(__archs_name_default "Auto")
-  endif()
-
-  # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
-  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
-  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} )
-  mark_as_advanced(CUDA_ARCH_NAME)
-
-  # verify CUDA_ARCH_NAME value
-  if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
-    string(REPLACE ";" ", " __archs_names "${__archs_names}")
-    message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.")
-  endif()
-
-  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
-    set(CUDA_ARCH_BIN ${xgboost_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
-    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
-  else()
-    unset(CUDA_ARCH_BIN CACHE)
-    unset(CUDA_ARCH_PTX CACHE)
-  endif()
-
-  if(${CUDA_ARCH_NAME} STREQUAL "Fermi")
-    set(__cuda_arch_bin "20 21(20)")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler")
-    set(__cuda_arch_bin "30 35")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    set(__cuda_arch_bin "50")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    set(__cuda_arch_bin "60 61")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
-    set(__cuda_arch_bin ${xgboost_known_gpu_archs})
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
-    xgboost_detect_installed_gpus(__cuda_arch_bin)
-  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
-    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
-  endif()
-
-  # remove dots and convert to lists
-  string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}")
-  string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}")
-  string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}")
-  string(REGEX MATCHALL "[0-9]+"   __cuda_arch_ptx "${__cuda_arch_ptx}")
-  xgboost_list_unique(__cuda_arch_bin __cuda_arch_ptx)
-
-  set(__nvcc_flags "")
-  set(__nvcc_archs_readable "")
-
-  # Tell NVCC to add binaries for the specified GPUs
-  foreach(__arch ${__cuda_arch_bin})
-    if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
-      # User explicitly specified PTX for the concrete BIN
-      list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
-      list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1})
-    else()
-      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
-      list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch})
-      list(APPEND __nvcc_archs_readable sm_${__arch})
-    endif()
-  endforeach()
-
-  # Tell NVCC to add PTX intermediate code for the specified architectures
-  foreach(__arch ${__cuda_arch_ptx})
-    list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch})
-    list(APPEND __nvcc_archs_readable compute_${__arch})
-  endforeach()
-
-  string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}")
-  set(${out_variable}          ${__nvcc_flags}          PARENT_SCOPE)
-  set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
-endfunction()
-
-################################################################################################
-# Short command for cuda comnpilation
-# Usage:
-#   xgboost_cuda_compile(<objlist_variable> <cuda_files>)
-macro(xgboost_cuda_compile objlist_variable)
-  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-    set(${var}_backup_in_cuda_compile_ "${${var}}")
-
-    # we remove /EHa as it generates warnings under windows
-    string(REPLACE "/EHa" "" ${var} "${${var}}")
-
-  endforeach()
-  if(UNIX OR APPLE)
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
-  endif()
-
-  if(APPLE)
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
-  endif()
-
-  set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G -lineinfo")
-
-  if(MSVC)
-    # disable noisy warnings:
-    # 4819: The file contains a character that cannot be represented in the current code page (number).
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-  endif()
-
-  # If the build system is a container, make sure the nvcc intermediate files
-  # go into the build output area rather than in /tmp, which may run out of space
-  if(IS_CONTAINER_BUILD)
-    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
-    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
-  endif()
-
-  cuda_compile(cuda_objcs ${ARGN})
-
-  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-    set(${var} "${${var}_backup_in_cuda_compile_}")
-    unset(${var}_backup_in_cuda_compile_)
-  endforeach()
-
-  set(${objlist_variable} ${cuda_objcs})
-endmacro()
-
-
-################################################################################################
-###  Non macro section
-################################################################################################
-
-# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
-if(NOT CUDA_TOOLKIT_ROOT_DIR)
-  find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
-  if(CUDA_LIBRARY_PATH)
-    get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
-    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
-  endif()
-endif()
-
-find_package(CUDA 5.5 QUIET REQUIRED)
-find_cuda_helper_libs(curand)  # cmake 2.8.7 compartibility which doesn't search for curand
-
-if(NOT CUDA_FOUND)
-  return()
-endif()
-
-set(HAVE_CUDA TRUE)
-message(STATUS "CUDA detected: " ${CUDA_VERSION})
-include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
-list(APPEND xgboost_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
-                              ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
-
-# Known NVIDIA GPU achitectures xgboost can be compiled for.
-# This list will be used for CUDA_ARCH_NAME = All option
-if(CUDA_ARCH_ALL)
-  set(xgboost_known_gpu_archs "${CUDA_ARCH_ALL}")
-else()
-  if(${CUDA_VERSION} GREATER 7.5)
-    set(xgboost_known_gpu_archs "30 35 50 52 60 61")
-  else()
-    set(xgboost_known_gpu_archs "30 35 50 52")
-  endif()
-endif()
-
-# cudnn detection
-if(USE_CUDNN)
-  detect_cuDNN()
-  if(HAVE_CUDNN)
-    add_definitions(-DUSE_CUDNN)
-    include_directories(SYSTEM ${CUDNN_INCLUDE})
-    list(APPEND xgboost_LINKER_LIBS ${CUDNN_LIBRARY})
-  endif()
-endif()
-
-# setting nvcc arch flags
-xgboost_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
-list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
-message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
-
-# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
-# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
-if(Boost_VERSION EQUAL 105500)
-  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
-  # avoid warning for CMake >= 2.8.12
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
-endif()
-
-# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc.
-foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
-  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
-endforeach()
-
-# setting default testing device
-if(NOT CUDA_TEST_DEVICE)
-  set(CUDA_TEST_DEVICE -1)
-endif()
-
-mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
-mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
-
-# Handle clang/libc++ issue
-if(APPLE)
-  xgboost_detect_darwin_version(OSX_VERSION)
-
-  # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits
-  if(OSX_VERSION VERSION_GREATER 10.8)
-    # enabled by default if and only if CUDA version is less than 7.0
-    xgboost_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
-  endif()
-endif()
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@ -1,398 +0,0 @@
-################################################################################################
-# Command alias for debugging messages
-# Usage:
-#   dmsg(<message>)
-function(dmsg)
-  message(STATUS ${ARGN})
-endfunction()
-
-################################################################################################
-# Removes duplicates from list(s)
-# Usage:
-#   xgboost_list_unique(<list_variable> [<list_variable>] [...])
-macro(xgboost_list_unique)
-  foreach(__lst ${ARGN})
-    if(${__lst})
-      list(REMOVE_DUPLICATES ${__lst})
-    endif()
-  endforeach()
-endmacro()
-
-################################################################################################
-# Clears variables from list
-# Usage:
-#   xgboost_clear_vars(<variables_list>)
-macro(xgboost_clear_vars)
-  foreach(_var ${ARGN})
-    unset(${_var})
-  endforeach()
-endmacro()
-
-################################################################################################
-# Removes duplicates from string
-# Usage:
-#   xgboost_string_unique(<string_variable>)
-function(xgboost_string_unique __string)
-  if(${__string})
-    set(__list ${${__string}})
-    separate_arguments(__list)
-    list(REMOVE_DUPLICATES __list)
-    foreach(__e ${__list})
-      set(__str "${__str} ${__e}")
-    endforeach()
-    set(${__string} ${__str} PARENT_SCOPE)
-  endif()
-endfunction()
-
-################################################################################################
-# Prints list element per line
-# Usage:
-#   xgboost_print_list(<list>)
-function(xgboost_print_list)
-  foreach(e ${ARGN})
-    message(STATUS ${e})
-  endforeach()
-endfunction()
-
-################################################################################################
-# Function merging lists of compiler flags to single string.
-# Usage:
-#   xgboost_merge_flag_lists(out_variable <list1> [<list2>] [<list3>] ...)
-function(xgboost_merge_flag_lists out_var)
-  set(__result "")
-  foreach(__list ${ARGN})
-    foreach(__flag ${${__list}})
-      string(STRIP ${__flag} __flag)
-      set(__result "${__result} ${__flag}")
-    endforeach()
-  endforeach()
-  string(STRIP ${__result} __result)
-  set(${out_var} ${__result} PARENT_SCOPE)
-endfunction()
-
-################################################################################################
-# Converts all paths in list to absolute
-# Usage:
-#   xgboost_convert_absolute_paths(<list_variable>)
-function(xgboost_convert_absolute_paths variable)
-  set(__dlist "")
-  foreach(__s ${${variable}})
-    get_filename_component(__abspath ${__s} ABSOLUTE)
-    list(APPEND __list ${__abspath})
-  endforeach()
-  set(${variable} ${__list} PARENT_SCOPE)
-endfunction()
-
-################################################################################################
-# Reads set of version defines from the header file
-# Usage:
-#   xgboost_parse_header(<file> <define1> <define2> <define3> ..)
-macro(xgboost_parse_header FILENAME FILE_VAR)
-  set(vars_regex "")
-  set(__parnet_scope OFF)
-  set(__add_cache OFF)
-  foreach(name ${ARGN})
-    if("${name}" STREQUAL "PARENT_SCOPE")
-      set(__parnet_scope ON)
-    elseif("${name}" STREQUAL "CACHE")
-      set(__add_cache ON)
-    elseif(vars_regex)
-      set(vars_regex "${vars_regex}|${name}")
-    else()
-      set(vars_regex "${name}")
-    endif()
-  endforeach()
-  if(EXISTS "${FILENAME}")
-    file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" )
-  else()
-    unset(${FILE_VAR})
-  endif()
-  foreach(name ${ARGN})
-    if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
-      if(${FILE_VAR})
-        if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
-          string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
-        else()
-          set(${name} "")
-        endif()
-        if(__add_cache)
-          set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
-        elseif(__parnet_scope)
-          set(${name} "${${name}}" PARENT_SCOPE)
-        endif()
-      else()
-        unset(${name} CACHE)
-      endif()
-    endif()
-  endforeach()
-endmacro()
-
-################################################################################################
-# Reads single version define from the header file and parses it
-# Usage:
-#   xgboost_parse_header_single_define(<library_name> <file> <define_name>)
-function(xgboost_parse_header_single_define LIBNAME HDR_PATH VARNAME)
-  set(${LIBNAME}_H "")
-  if(EXISTS "${HDR_PATH}")
-    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
-  endif()
-
-  if(${LIBNAME}_H)
-    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
-    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR  "${${LIBNAME}_H}")
-    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
-    set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE)
-    set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE)
-    set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE)
-    set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE)
-
-    # append a TWEAK version if it exists:
-    set(${LIBNAME}_VERSION_TWEAK "")
-    if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$")
-      set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE)
-    endif()
-    if(${LIBNAME}_VERSION_TWEAK)
-      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE)
-    else()
-      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE)
-    endif()
-  endif()
-endfunction()
-
-########################################################################################################
-# An option that the user can select. Can accept condition to control when option is available for user.
-# Usage:
-#   xgboost_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
-function(xgboost_option variable description value)
-  set(__value ${value})
-  set(__condition "")
-  set(__varname "__value")
-  foreach(arg ${ARGN})
-    if(arg STREQUAL "IF" OR arg STREQUAL "if")
-      set(__varname "__condition")
-    else()
-      list(APPEND ${__varname} ${arg})
-    endif()
-  endforeach()
-  unset(__varname)
-  if("${__condition}" STREQUAL "")
-    set(__condition 2 GREATER 1)
-  endif()
-
-  if(${__condition})
-    if("${__value}" MATCHES ";")
-      if(${__value})
-        option(${variable} "${description}" ON)
-      else()
-        option(${variable} "${description}" OFF)
-      endif()
-    elseif(DEFINED ${__value})
-      if(${__value})
-        option(${variable} "${description}" ON)
-      else()
-        option(${variable} "${description}" OFF)
-      endif()
-    else()
-      option(${variable} "${description}" ${__value})
-    endif()
-  else()
-    unset(${variable} CACHE)
-  endif()
-endfunction()
-
-################################################################################################
-# Utility macro for comparing two lists. Used for CMake debugging purposes
-# Usage:
-#   xgboost_compare_lists(<list_variable> <list2_variable> [description])
-function(xgboost_compare_lists list1 list2 desc)
-  set(__list1 ${${list1}})
-  set(__list2 ${${list2}})
-  list(SORT __list1)
-  list(SORT __list2)
-  list(LENGTH __list1 __len1)
-  list(LENGTH __list2 __len2)
-
-  if(NOT ${__len1} EQUAL ${__len2})
-    message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}")
-  endif()
-
-  foreach(__i RANGE 1 ${__len1})
-    math(EXPR __index "${__i}- 1")
-    list(GET __list1 ${__index} __item1)
-    list(GET __list2 ${__index} __item2)
-    if(NOT ${__item1} STREQUAL ${__item2})
-      message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. ${desc}")
-    endif()
-  endforeach()
-endfunction()
-
-################################################################################################
-# Command for disabling warnings for different platforms (see below for gcc and VisualStudio)
-# Usage:
-#   xgboost_warnings_disable(<CMAKE_[C|CXX]_FLAGS[_CONFIGURATION]> -Wshadow /wd4996 ..,)
-macro(xgboost_warnings_disable)
-  set(_flag_vars "")
-  set(_msvc_warnings "")
-  set(_gxx_warnings "")
-
-  foreach(arg ${ARGN})
-    if(arg MATCHES "^CMAKE_")
-      list(APPEND _flag_vars ${arg})
-    elseif(arg MATCHES "^/wd")
-      list(APPEND _msvc_warnings ${arg})
-    elseif(arg MATCHES "^-W")
-      list(APPEND _gxx_warnings ${arg})
-    endif()
-  endforeach()
-
-  if(NOT _flag_vars)
-    set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-  endif()
-
-  if(MSVC AND _msvc_warnings)
-    foreach(var ${_flag_vars})
-      foreach(warning ${_msvc_warnings})
-        set(${var} "${${var}} ${warning}")
-      endforeach()
-    endforeach()
-  elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings)
-    foreach(var ${_flag_vars})
-      foreach(warning ${_gxx_warnings})
-        if(NOT warning MATCHES "^-Wno-")
-          string(REPLACE "${warning}" "" ${var} "${${var}}")
-          string(REPLACE "-W" "-Wno-" warning "${warning}")
-        endif()
-        set(${var} "${${var}} ${warning}")
-      endforeach()
-    endforeach()
-  endif()
-  xgboost_clear_vars(_flag_vars _msvc_warnings _gxx_warnings)
-endmacro()
-
-################################################################################################
-# Helper function get current definitions
-# Usage:
-#   xgboost_get_current_definitions(<definitions_variable>)
-function(xgboost_get_current_definitions definitions_var)
-  get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS)
-  set(result "")
-
-  foreach(d ${current_definitions})
-    list(APPEND result -D${d})
-  endforeach()
-
-  xgboost_list_unique(result)
-  set(${definitions_var} ${result} PARENT_SCOPE)
-endfunction()
-
-################################################################################################
-# Helper function get current includes/definitions
-# Usage:
-#   xgboost_get_current_cflags(<cflagslist_variable>)
-function(xgboost_get_current_cflags cflags_var)
-  get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
-  xgboost_convert_absolute_paths(current_includes)
-  xgboost_get_current_definitions(cflags)
-
-  foreach(i ${current_includes})
-    list(APPEND cflags "-I${i}")
-  endforeach()
-
-  xgboost_list_unique(cflags)
-  set(${cflags_var} ${cflags} PARENT_SCOPE)
-endfunction()
-
-################################################################################################
-# Helper function to parse current linker libs into link directories, libflags and osx frameworks
-# Usage:
-#   xgboost_parse_linker_libs(<xgboost_LINKER_LIBS_var> <directories_var> <libflags_var> <frameworks_var>)
-function(xgboost_parse_linker_libs xgboost_LINKER_LIBS_variable folders_var flags_var frameworks_var)
-
-  set(__unspec "")
-  set(__debug "")
-  set(__optimized "")
-  set(__framework "")
-  set(__varname "__unspec")
-
-  # split libs into debug, optimized, unspecified and frameworks
-  foreach(list_elem ${${xgboost_LINKER_LIBS_variable}})
-    if(list_elem STREQUAL "debug")
-      set(__varname "__debug")
-    elseif(list_elem STREQUAL "optimized")
-      set(__varname "__optimized")
-    elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)")
-      list(APPEND __framework -framework ${CMAKE_MATCH_1})
-    else()
-      list(APPEND ${__varname} ${list_elem})
-      set(__varname "__unspec")
-    endif()
-  endforeach()
-
-  # attach debug or optimized libs to unspecified according to current configuration
-  if(CMAKE_BUILD_TYPE MATCHES "Debug")
-    set(__libs ${__unspec} ${__debug})
-  else()
-    set(__libs ${__unspec} ${__optimized})
-  endif()
-
-  set(libflags "")
-  set(folders "")
-
-  # convert linker libraries list to link flags
-  foreach(lib ${__libs})
-    if(TARGET ${lib})
-      list(APPEND folders $<TARGET_LINKER_FILE_DIR:${lib}>)
-      list(APPEND libflags -l${lib})
-    elseif(lib MATCHES "^-l.*")
-      list(APPEND libflags ${lib})
-    elseif(IS_ABSOLUTE ${lib})
-      get_filename_component(name_we ${lib} NAME_WE)
-      get_filename_component(folder  ${lib} PATH)
-
-      string(REGEX MATCH "^lib(.*)" __match ${name_we})
-      list(APPEND libflags -l${CMAKE_MATCH_1})
-      list(APPEND folders    ${folder})
-    else()
-      message(FATAL_ERROR "Logic error. Need to update cmake script")
-    endif()
-  endforeach()
-
-  xgboost_list_unique(libflags folders)
-
-  set(${folders_var} ${folders} PARENT_SCOPE)
-  set(${flags_var} ${libflags} PARENT_SCOPE)
-  set(${frameworks_var} ${__framework} PARENT_SCOPE)
-endfunction()
-
-################################################################################################
-# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, ....
-# Usage:
-#   xgboost_detect_darwin_version(<version_variable>)
-function(xgboost_detect_darwin_version output_var)
-  if(APPLE)
-    execute_process(COMMAND /usr/bin/sw_vers -productVersion
-                    RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out
-                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-    set(${output_var} ${__sw_vers_out} PARENT_SCOPE)
-  else()
-    set(${output_var} "" PARENT_SCOPE)
-  endif()
-endfunction()
-
-################################################################################################
-# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
-# Usage:
-#   caffe_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
-function(xgboost_source_group group)
-  cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
-  if(CAFFE_SOURCE_GROUP_GLOB)
-    file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB})
-    source_group(${group} FILES ${srcs1})
-  endif()
-
-  if(CAFFE_SOURCE_GROUP_GLOB_RECURSE)
-    file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE})
-    source_group(${group} FILES ${srcs2})
-  endif()
-endfunction()
--- a/demo/gpu_acceleration/bosch.py
+++ b/demo/gpu_acceleration/bosch.py
@ -24,8 +24,7 @@ param['eval_metric'] = 'auc'
 param['max_depth'] = 5
 param['eta'] = 0.3
 param['silent'] = 0
-param['updater'] = 'grow_gpu'
-#param['updater'] = 'grow_colmaker'
+param['tree_method'] = 'gpu_exact'

 num_round = 20

--- a/plugin/updater_gpu/README.md
+++ b/plugin/updater_gpu/README.md
@ -1,16 +1,16 @@
 # CUDA Accelerated Tree Construction Algorithms
 This plugin adds GPU accelerated tree construction algorithms to XGBoost.
 ## Usage
-Specify the 'updater' parameter as one of the following algorithms. 
+Specify the 'tree_method' parameter as one of the following algorithms. 

 ### Algorithms
-| updater | Description |
+| tree_method | Description |
 | --- | --- |
-grow_gpu | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'grow_gpu_hist' |
-grow_gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Faster and uses considerably less memory. Splits may be less accurate. |
+gpu_exact | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'gpu_hist' |
+gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Faster and uses considerably less memory. Splits may be less accurate. |

 ### Supported parameters 
-| parameter | grow_gpu | grow_gpu_hist |
+| parameter | gpu_exact | gpu_hist |
 | --- | --- | --- |
 subsample | &#10004; | &#10004; |
 colsample_bytree | &#10004; | &#10004;|
@ -29,7 +29,7 @@ Python example:
 ```python
 param['gpu_id'] = 1
 param['max_bin'] = 16
-param['updater'] = 'grow_gpu_hist'
+param['tree_method'] = 'gpu_hist'
 ```
 ## Benchmarks
 To run benchmarks on synthetic data for binary classification:
@ -39,18 +39,18 @@ $ python benchmark/benchmark.py

 Training time on 1000000 rows x 50 columns with 500 boosting iterations on i7-6700K CPU @ 4.00GHz and Pascal Titan X.

-| Updater | Time (s) |
+| tree_method | Time (s) |
 | --- | --- |
-| grow_gpu_hist | 11.09 |
-| grow_fast_histmaker (histogram XGBoost - CPU) | 41.75 |
-| grow_gpu | 193.90 |
-| grow_colmaker (standard XGBoost - CPU) | 720.12 |
+| gpu_hist | 11.09 |
+| hist (histogram XGBoost - CPU) | 41.75 |
+| gpu_exact | 193.90 |
+| exact (standard XGBoost - CPU) | 720.12 |


-[See here](http://dmlc.ml/2016/12/14/GPU-accelerated-xgboost.html) for additional performance benchmarks of the 'grow_gpu' updater.
+[See here](http://dmlc.ml/2016/12/14/GPU-accelerated-xgboost.html) for additional performance benchmarks of the 'gpu_exact' tree_method.

 ## Test
-To run tests:
+To run tests:
 ```bash
 $ python -m nose test/python/
 ```
@ -122,6 +122,13 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
 ```

 ## Changelog
+##### 2017/6/26
+
+* Change API to use tree_method parameter
+* Increase required cmake version to 3.5
+* Add compute arch 3.5 to default archs
+* Set default n_gpus to 1
+
 ##### 2017/6/5

 * Multi-GPU support for histogram method using NVIDIA NCCL.
--- a/plugin/updater_gpu/benchmark/benchmark.py
+++ b/plugin/updater_gpu/benchmark/benchmark.py
@ -14,19 +14,18 @@ def run_benchmark(args, gpu_algorithm, cpu_algorithm):
    dtrain = xgb.DMatrix(X, y)

    param = {'objective': 'binary:logistic',
-             'tree_method': 'exact',
             'max_depth': 6,
             'silent': 1,
             'eval_metric': 'auc'}

-    param['updater'] = gpu_algorithm
-    print("Training with '%s'" % param['updater'])
+    param['tree_method'] = gpu_algorithm
+    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations)
    print ("Time: %s seconds" % (str(time.time() - tmp)))

-    param['updater'] = cpu_algorithm
-    print("Training with '%s'" % param['updater'])
+    param['tree_method'] = cpu_algorithm
+    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations)
    print ("Time: %s seconds" % (str(time.time() - tmp)))
@ -34,17 +33,17 @@ def run_benchmark(args, gpu_algorithm, cpu_algorithm):


 parser = argparse.ArgumentParser()
-parser.add_argument('--algorithm', choices=['all', 'grow_gpu', 'grow_gpu_hist'], required=True)
+parser.add_argument('--algorithm', choices=['all', 'gpu_exact', 'gpu_hist'], default='all')
 parser.add_argument('--rows',type=int,default=1000000)
 parser.add_argument('--columns',type=int,default=50)
 parser.add_argument('--iterations',type=int,default=500)
 args = parser.parse_args()

-if 'grow_gpu_hist' in args.algorithm:
-    run_benchmark(args, args.algorithm, 'grow_fast_histmaker')
-if 'grow_gpu' in args.algorithm:
-    run_benchmark(args, args.algorithm, 'grow_colmaker')
+if 'gpu_hist' in args.algorithm:
+    run_benchmark(args, args.algorithm, 'hist')
+if 'gpu_exact' in args.algorithm:
+    run_benchmark(args, args.algorithm, 'exact')
 if 'all' in args.algorithm:
-    run_benchmark(args, 'grow_gpu', 'grow_colmaker')
-    run_benchmark(args, 'grow_gpu_hist', 'grow_fast_histmaker')
+    run_benchmark(args, 'gpu_exact', 'exact')
+    run_benchmark(args, 'gpu_hist', 'hist')

--- a/plugin/updater_gpu/test/python/test.py
+++ b/plugin/updater_gpu/test/python/test.py
@ -35,7 +35,7 @@ class TestGPU(unittest.TestCase):
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc'}
        ag_param2 = {'max_depth': 2,
-                     'updater': 'grow_gpu',
+                     'tree_method': 'gpu_exact',
                     'eta': 1,
                     'silent': 1,
                     'objective': 'binary:logistic',
@ -59,7 +59,7 @@ class TestGPU(unittest.TestCase):
        dtest = xgb.DMatrix(X_test, y_test)

        param = {'objective': 'binary:logistic',
-                 'updater': 'grow_gpu',
+                 'tree_method': 'gpu_exact',
                 'max_depth': 3,
                 'eval_metric': 'auc'}
        res = {}
@ -75,7 +75,7 @@ class TestGPU(unittest.TestCase):
        dtrain2 = xgb.DMatrix(X2, label=y2)

        param = {'objective': 'binary:logistic',
-                 'updater': 'grow_gpu',
+                 'tree_method': 'gpu_exact',
                 'max_depth': 2,
                 'eval_metric': 'auc'}
        res = {}
@ -134,7 +134,7 @@ class TestGPU(unittest.TestCase):
                            'objective': 'binary:logistic',
                            'eval_metric': 'auc'}
                ag_param2 = {'max_depth': max_depth,
-                             'updater': 'grow_gpu_hist',
+                             'tree_method': 'gpu_hist',
                             'eta': 1,
                             'silent': 1,
                             'n_gpus': 1,
@ -142,7 +142,7 @@ class TestGPU(unittest.TestCase):
                                 'max_bin': max_bin,
                             'eval_metric': 'auc'}
                ag_param3 = {'max_depth': max_depth,
-                             'updater': 'grow_gpu_hist',
+                             'tree_method': 'gpu_hist',
                             'eta': 1,
                             'silent': 1,
                             'n_gpus': n_gpus,
@ -177,7 +177,7 @@ class TestGPU(unittest.TestCase):
                dtest = xgb.DMatrix(X_test, y_test)

                param = {'objective': 'binary:logistic',
-                         'updater': 'grow_gpu_hist',
+                         'tree_method': 'gpu_hist',
                         'max_depth': max_depth,
                         'n_gpus': 1,
                         'max_bin': max_bin,
@ -189,7 +189,7 @@ class TestGPU(unittest.TestCase):
                assert self.non_decreasing(res['train']['auc'])
                #assert self.non_decreasing(res['test']['auc'])
                param2 = {'objective': 'binary:logistic',
-                          'updater': 'grow_gpu_hist',
+                          'tree_method': 'gpu_hist',
                          'max_depth': max_depth,
                          'n_gpus': n_gpus,
                          'max_bin': max_bin,
@ -211,7 +211,7 @@ class TestGPU(unittest.TestCase):
                dtrain2 = xgb.DMatrix(X2, label=y2)

                param = {'objective': 'binary:logistic',
-                         'updater': 'grow_gpu_hist',
+                         'tree_method': 'gpu_hist',
                         'max_depth': max_depth,
                         'n_gpus': n_gpus,
                         'max_bin': max_bin,
@ -250,7 +250,7 @@ class TestGPU(unittest.TestCase):
                ######################################################################
                # fail-safe test for max_bin
                param = {'objective': 'binary:logistic',
-                         'updater': 'grow_gpu_hist',
+                         'tree_method': 'gpu_hist',
                         'max_depth': max_depth,
                         'n_gpus': n_gpus,
                         'eval_metric': 'auc',
@ -263,7 +263,7 @@ class TestGPU(unittest.TestCase):
                ######################################################################
                # subsampling
                param = {'objective': 'binary:logistic',
-                         'updater': 'grow_gpu_hist',
+                         'tree_method': 'gpu_hist',
                         'max_depth': max_depth,
                         'n_gpus': n_gpus,
                         'eval_metric': 'auc',
@ -279,7 +279,7 @@ class TestGPU(unittest.TestCase):
        ######################################################################
        # fail-safe test for max_bin=2
        param = {'objective': 'binary:logistic',
-                 'updater': 'grow_gpu_hist',
+                 'tree_method': 'gpu_hist',
                 'max_depth': 2,
                 'n_gpus': n_gpus,
                 'eval_metric': 'auc',
--- a/src/learner.cc
+++ b/src/learner.cc
@ -4,19 +4,19 @@
 * \brief Implementation of learning algorithm.
 * \author Tianqi Chen
 */
-#include <xgboost/logging.h>
-#include <xgboost/learner.h>
-#include <dmlc/timer.h>
 #include <dmlc/io.h>
+#include <dmlc/timer.h>
+#include <xgboost/learner.h>
+#include <xgboost/logging.h>
 #include <algorithm>
-#include <vector>
-#include <utility>
-#include <string>
-#include <sstream>
-#include <limits>
 #include <iomanip>
-#include "./common/io.h"
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
 #include "./common/common.h"
+#include "./common/io.h"
 #include "./common/random.h"

 namespace xgboost {
@ -25,17 +25,14 @@ bool Learner::AllowLazyCheckPoint() const {
  return gbm_->AllowLazyCheckPoint();
 }

-std::vector<std::string>
-Learner::DumpModel(const FeatureMap& fmap,
+std::vector<std::string> Learner::DumpModel(const FeatureMap& fmap,
                                            bool with_stats,
                                            std::string format) const {
  return gbm_->DumpModel(fmap, with_stats, format);
 }

-
 /*! \brief training parameter for regression */
-struct LearnerModelParam
-    : public dmlc::Parameter<LearnerModelParam> {
+struct LearnerModelParam : public dmlc::Parameter<LearnerModelParam> {
  /* \brief global bias */
  bst_float base_score;
  /* \brief number of features  */
@ -55,20 +52,21 @@ struct LearnerModelParam
  }
  // declare parameters
  DMLC_DECLARE_PARAMETER(LearnerModelParam) {
-    DMLC_DECLARE_FIELD(base_score).set_default(0.5f)
+    DMLC_DECLARE_FIELD(base_score)
+        .set_default(0.5f)
        .describe("Global bias of the model.");
-    DMLC_DECLARE_FIELD(num_feature).set_default(0)
-        .describe("Number of features in training data,"\
+    DMLC_DECLARE_FIELD(num_feature)
+        .set_default(0)
+        .describe(
+            "Number of features in training data,"
            " this parameter will be automatically detected by learner.");
-    DMLC_DECLARE_FIELD(num_class).set_default(0).set_lower_bound(0)
-        .describe("Number of class option for multi-class classifier. "\
+    DMLC_DECLARE_FIELD(num_class).set_default(0).set_lower_bound(0).describe(
+        "Number of class option for multi-class classifier. "
        " By default equals 0 and corresponds to binary classifier.");
  }
 };

-
-struct LearnerTrainParam
-    : public dmlc::Parameter<LearnerTrainParam> {
+struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
  // stored random seed
  int seed;
  // whether seed the PRNG each iteration
@ -90,30 +88,40 @@ struct LearnerTrainParam
  int debug_verbose;
  // declare parameters
  DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
-    DMLC_DECLARE_FIELD(seed).set_default(0)
-        .describe("Random number seed during training.");
-    DMLC_DECLARE_FIELD(seed_per_iteration).set_default(false)
-        .describe("Seed PRNG determnisticly via iterator number, "\
-                  "this option will be switched on automatically on distributed mode.");
-    DMLC_DECLARE_FIELD(dsplit).set_default(0)
+    DMLC_DECLARE_FIELD(seed).set_default(0).describe(
+        "Random number seed during training.");
+    DMLC_DECLARE_FIELD(seed_per_iteration)
+        .set_default(false)
+        .describe(
+            "Seed PRNG determnisticly via iterator number, "
+            "this option will be switched on automatically on distributed "
+            "mode.");
+    DMLC_DECLARE_FIELD(dsplit)
+        .set_default(0)
        .add_enum("auto", 0)
        .add_enum("col", 1)
        .add_enum("row", 2)
        .describe("Data split mode for distributed training.");
-    DMLC_DECLARE_FIELD(tree_method).set_default(0)
+    DMLC_DECLARE_FIELD(tree_method)
+        .set_default(0)
        .add_enum("auto", 0)
        .add_enum("approx", 1)
        .add_enum("exact", 2)
        .add_enum("hist", 3)
+        .add_enum("gpu_exact", 4)
+        .add_enum("gpu_hist", 5)
        .describe("Choice of tree construction method.");
-    DMLC_DECLARE_FIELD(test_flag).set_default("")
-        .describe("Internal test flag");
-    DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
+    DMLC_DECLARE_FIELD(test_flag).set_default("").describe(
+        "Internal test flag");
+    DMLC_DECLARE_FIELD(prob_buffer_row)
+        .set_default(1.0f)
+        .set_range(0.0f, 1.0f)
        .describe("Maximum buffered row portion");
-    DMLC_DECLARE_FIELD(max_row_perbatch).set_default(std::numeric_limits<size_t>::max())
+    DMLC_DECLARE_FIELD(max_row_perbatch)
+        .set_default(std::numeric_limits<size_t>::max())
        .describe("maximum row per batch.");
-    DMLC_DECLARE_FIELD(nthread).set_default(0)
-        .describe("Number of threads to use.");
+    DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
+        "Number of threads to use.");
    DMLC_DECLARE_FIELD(debug_verbose)
        .set_lower_bound(0)
        .set_default(0)
@ -125,8 +133,8 @@ DMLC_REGISTER_PARAMETER(LearnerModelParam);
 DMLC_REGISTER_PARAMETER(LearnerTrainParam);

 /*!
- * \brief learner that performs gradient boosting for a specific objective function.
- *  It does training and prediction.
+ * \brief learner that performs gradient boosting for a specific objective
+ * function. It does training and prediction.
 */
 class LearnerImpl : public Learner {
 public:
@ -137,7 +145,34 @@ class LearnerImpl : public Learner {
    name_gbm_ = "gbtree";
  }

-  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+  void ConfigureUpdaters() {
+    if (tparam.tree_method == 0 || tparam.tree_method == 1 ||
+        tparam.tree_method == 2) {
+      if (cfg_.count("updater") == 0) {
+        if (tparam.dsplit == 1) {
+          cfg_["updater"] = "distcol";
+        } else if (tparam.dsplit == 2) {
+          cfg_["updater"] = "grow_histmaker,prune";
+        }
+        if (tparam.prob_buffer_row != 1.0f) {
+          cfg_["updater"] = "grow_histmaker,refresh,prune";
+        }
+      }
+    } else if (tparam.tree_method == 3) {
+      /* histogram-based algorithm */
+      LOG(CONSOLE) << "Tree method is selected to be \'hist\', which uses a "
+                      "single updater "
+                   << "grow_fast_histmaker.";
+      cfg_["updater"] = "grow_fast_histmaker";
+    } else if (tparam.tree_method == 4) {
+      cfg_["updater"] = "grow_gpu,prune";
+    } else if (tparam.tree_method == 5) {
+      cfg_["updater"] = "grow_gpu_hist";
+    }
+  }
+
+  void Configure(
+      const std::vector<std::pair<std::string, std::string> >& args) override {
    // add to configurations
    tparam.InitAllowUnknown(args);
    cfg_.clear();
@ -172,27 +207,13 @@ class LearnerImpl : public Learner {
      }
    }

-    if (cfg_.count("max_delta_step") == 0 &&
-        cfg_.count("objective") != 0 &&
+    if (cfg_.count("max_delta_step") == 0 && cfg_.count("objective") != 0 &&
        cfg_["objective"] == "count:poisson") {
      cfg_["max_delta_step"] = "0.7";
    }

-    if (tparam.tree_method == 3) {
-      /* histogram-based algorithm */
-      LOG(CONSOLE) << "Tree method is selected to be \'hist\', which uses a single updater "
-                   << "grow_fast_histmaker.";
-      cfg_["updater"] = "grow_fast_histmaker";
-    } else if (cfg_.count("updater") == 0) {
-      if (tparam.dsplit == 1) {
-        cfg_["updater"] = "distcol";
-      } else if (tparam.dsplit == 2) {
-        cfg_["updater"] = "grow_histmaker,prune";
-      }
-      if (tparam.prob_buffer_row != 1.0f) {
-        cfg_["updater"] = "grow_histmaker,refresh,prune";
-      }
-    }
+    ConfigureUpdaters();
+
    if (cfg_.count("objective") == 0) {
      cfg_["objective"] = "reg:linear";
    }
@ -220,9 +241,7 @@ class LearnerImpl : public Learner {
    }
  }

-  void InitModel() override {
-    this->LazyInitModel();
-  }
+  void InitModel() override { this->LazyInitModel(); }

  void Load(dmlc::Stream* fi) override {
    // TODO(tqchen) mark deprecation of old format.
@ -259,8 +278,7 @@ class LearnerImpl : public Learner {
            << "BoostLearner: wrong model format";
      }
    }
-    CHECK(fi->Read(&name_gbm_))
-        << "BoostLearner: wrong model format";
+    CHECK(fi->Read(&name_gbm_)) << "BoostLearner: wrong model format";
    // duplicated code with LazyInitModel
    obj_.reset(ObjFunction::Create(name_obj_));
    gbm_.reset(GradientBooster::Create(name_gbm_, cache_, mparam.base_score));
@ -268,8 +286,8 @@ class LearnerImpl : public Learner {
    if (mparam.contain_extra_attrs != 0) {
      std::vector<std::pair<std::string, std::string> > attr;
      fi->Read(&attr);
-      attributes_ = std::map<std::string, std::string>(
-          attr.begin(), attr.end());
+      attributes_ =
+          std::map<std::string, std::string>(attr.begin(), attr.end());
    }
    if (name_obj_ == "count:poisson") {
      std::string max_delta_step;
@ -300,9 +318,9 @@ class LearnerImpl : public Learner {
      fo->Write(attr);
    }
    if (name_obj_ == "count:poisson") {
-        std::map<std::string, std::string>::const_iterator it = cfg_.find("max_delta_step");
-        if (it != cfg_.end())
-            fo->Write(it->second);
+      std::map<std::string, std::string>::const_iterator it =
+          cfg_.find("max_delta_step");
+      if (it != cfg_.end()) fo->Write(it->second);
    }
    if (mparam.contain_eval_metrics != 0) {
      std::vector<std::string> metr;
@ -325,8 +343,7 @@ class LearnerImpl : public Learner {
    gbm_->DoBoost(train, &gpair_, obj_.get());
  }

-  void BoostOneIter(int iter,
-                    DMatrix* train,
+  void BoostOneIter(int iter, DMatrix* train,
                    std::vector<bst_gpair>* in_gpair) override {
    if (tparam.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
@ -335,13 +352,11 @@ class LearnerImpl : public Learner {
    gbm_->DoBoost(train, in_gpair);
  }

-  std::string EvalOneIter(int iter,
-                          const std::vector<DMatrix*>& data_sets,
+  std::string EvalOneIter(int iter, const std::vector<DMatrix*>& data_sets,
                          const std::vector<std::string>& data_names) override {
    double tstart = dmlc::GetTime();
    std::ostringstream os;
-    os << '[' << iter << ']'
-       << std::setiosflags(std::ios::fixed);
+    os << '[' << iter << ']' << std::setiosflags(std::ios::fixed);
    if (metrics_.size() == 0) {
      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
    }
@ -388,20 +403,19 @@ class LearnerImpl : public Learner {
    return out;
  }

-  std::pair<std::string, bst_float> Evaluate(DMatrix* data, std::string metric) {
+  std::pair<std::string, bst_float> Evaluate(DMatrix* data,
+                                             std::string metric) {
    if (metric == "auto") metric = obj_->DefaultEvalMetric();
    std::unique_ptr<Metric> ev(Metric::Create(metric.c_str()));
    this->PredictRaw(data, &preds_);
    obj_->EvalTransform(&preds_);
-    return std::make_pair(metric, ev->Eval(preds_, data->info(), tparam.dsplit == 2));
+    return std::make_pair(metric,
+                          ev->Eval(preds_, data->info(), tparam.dsplit == 2));
  }

-  void Predict(DMatrix* data,
-               bool output_margin,
-               std::vector<bst_float> *out_preds,
-               unsigned ntree_limit,
-               bool pred_leaf,
-               bool pred_contribs) const override {
+  void Predict(DMatrix* data, bool output_margin,
+               std::vector<bst_float>* out_preds, unsigned ntree_limit,
+               bool pred_leaf, bool pred_contribs) const override {
    if (pred_contribs) {
      gbm_->PredictContribution(data, out_preds, ntree_limit);
    } else if (pred_leaf) {
@ -418,7 +432,12 @@ class LearnerImpl : public Learner {
  // check if p_train is ready to used by training.
  // if not, initialize the column access.
  inline void LazyInitDMatrix(DMatrix* p_train) {
-    if (tparam.tree_method != 3 && !p_train->HaveColAccess()) {
+    if (tparam.tree_method == 3 || tparam.tree_method == 4 ||
+        tparam.tree_method == 5) {
+      return;
+    }
+
+    if (!p_train->HaveColAccess()) {
      int ncol = static_cast<int>(p_train->info().num_col);
      std::vector<bool> enabled(ncol, true);
      // set max row per batch to limited value
@ -426,9 +445,9 @@ class LearnerImpl : public Learner {
      size_t max_row_perbatch = tparam.max_row_perbatch;
      const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);

-      if (tparam.tree_method == 0 &&
-          p_train->info().num_row >= (4UL << 20UL)) {
-        LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
+      if (tparam.tree_method == 0 && p_train->info().num_row >= (4UL << 20UL)) {
+        LOG(CONSOLE)
+            << "Tree method is automatically selected to be \'approx\'"
            << " for faster speed."
            << " to use old behavior(exact greedy algorithm on single machine),"
            << " set tree_method to \'exact\'";
@ -444,15 +463,14 @@ class LearnerImpl : public Learner {
        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
      }
      // initialize column access
-      p_train->InitColAccess(enabled,
-                             tparam.prob_buffer_row,
-                             max_row_perbatch);
+      p_train->InitColAccess(enabled, tparam.prob_buffer_row, max_row_perbatch);
    }

    if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
      if (tparam.tree_method == 2) {
        LOG(CONSOLE) << "tree method is set to be 'exact',"
-                     << " but currently we are only able to proceed with approximate algorithm";
+                     << " but currently we are only able to proceed with "
+                        "approximate algorithm";
      }
      cfg_["updater"] = "grow_histmaker,prune";
      if (gbm_.get() != nullptr) {
@ -462,9 +480,7 @@ class LearnerImpl : public Learner {
  }

  // return whether model is already initialized.
-  inline bool ModelInitialized() const {
-    return gbm_.get() != nullptr;
-  }
+  inline bool ModelInitialized() const { return gbm_.get() != nullptr; }
  // lazily initialize the model if it haven't yet been initialized.
  inline void LazyInitModel() {
    if (this->ModelInitialized()) return;
@ -497,14 +513,11 @@ class LearnerImpl : public Learner {
   * \param ntree_limit limit number of trees used for boosted tree
   *   predictor, when it equals 0, this means we are using all the trees
   */
-  inline void PredictRaw(DMatrix* data,
-                         std::vector<bst_float>* out_preds,
+  inline void PredictRaw(DMatrix* data, std::vector<bst_float>* out_preds,
                         unsigned ntree_limit = 0) const {
    CHECK(gbm_.get() != nullptr)
        << "Predict must happen after Load or InitModel";
-    gbm_->Predict(data,
-                  out_preds,
-                  ntree_limit);
+    gbm_->Predict(data, out_preds, ntree_limit);
  }
  // model parameter
  LearnerModelParam mparam;
@ -530,7 +543,8 @@ class LearnerImpl : public Learner {
  std::vector<std::shared_ptr<DMatrix> > cache_;
 };

-Learner* Learner::Create(const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
+Learner* Learner::Create(
+    const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
  return new LearnerImpl(cache_data);
 }
 }  // namespace xgboost