[GPU-Plugin] Change GPU plugin to use tree_method parameter, bump cmake version to 3.5 for GPU plugin, add compute architecture 3.5, remove unused cmake files (#2455)

Rory Mitchell 2017-06-29 16:19:45 +12:00 committed by GitHub
parent 88488fdbb9
commit 48f3003302
8 changed files with 168 additions and 835 deletions


@@ -3,9 +3,10 @@ project (xgboost)
find_package(OpenMP)
option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")
-set(GPU_COMPUTE_VER 50;52;60;61 CACHE STRING
+set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
"Space separated list of compute versions to be built against")
if(PLUGIN_UPDATER_GPU)
+cmake_minimum_required (VERSION 3.5)
find_package(CUDA REQUIRED)
endif()
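With the plugin enabled via `-DPLUGIN_UPDATER_GPU=ON`, a quick smoke test that the GPU methods are wired into the resulting library might look like the following (a hedged sketch, not part of the commit; an unbuilt plugin surfaces as an `XGBoostError` when the updater is missing):

```python
import numpy as np
import xgboost as xgb

# Tiny random problem; enough to force updater construction.
X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

try:
    xgb.train({'tree_method': 'gpu_hist', 'silent': 1}, dtrain, 2)
    print("GPU plugin available")
except xgb.core.XGBoostError as err:
    print("GPU plugin missing or no usable device:", err)
```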


@@ -1,289 +0,0 @@
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# mshadow_detect_installed_gpus(out_variable)
function(xgboost_detect_installed_gpus out_variable)
set(CUDA_gpu_detect_output "")
if(NOT CUDA_gpu_detect_output)
set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${__cufile} ""
"#include <cstdio>\n"
"int main()\n"
"{\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device)\n"
" {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
if(MSVC)
#find vcvarsall.bat and run it building msvc environment
get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
else()
if(CUDA_LIBRARY_PATH)
set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
endif()
execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
if(__nvcc_res EQUAL 0)
# nvcc outputs text containing line breaks when building with MSVC.
# The line below prevents CMake from inserting a variable with line
# breaks in the cache
string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}")
string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from xgboost_detect_gpus tool" FORCE)
else()
message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}")
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(WARNING "Automatic GPU detection failed. Building for all known architectures (${xgboost_known_gpu_archs}).")
set(${out_variable} ${xgboost_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# xgboost_select_nvcc_arch_flags(out_variable)
function(xgboost_select_nvcc_arch_flags out_variable)
# List of arch names
set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual")
set(__archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND __archs_names "Auto")
set(__archs_name_default "Auto")
endif()
# set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " __archs_names "${__archs_names}")
message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${xgboost_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Fermi")
set(__cuda_arch_bin "20 21(20)")
elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(__cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(__cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(__cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(__cuda_arch_bin ${xgboost_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
xgboost_detect_installed_gpus(__cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(__cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}")
string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}")
xgboost_list_unique(__cuda_arch_bin __cuda_arch_ptx)
set(__nvcc_flags "")
set(__nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(__arch ${__cuda_arch_bin})
if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch})
list(APPEND __nvcc_archs_readable sm_${__arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(__arch ${__cuda_arch_ptx})
list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch})
list(APPEND __nvcc_archs_readable compute_${__arch})
endforeach()
string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}")
set(${out_variable} ${__nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
endfunction()
################################################################################################
# Short command for cuda comnpilation
# Usage:
# xgboost_cuda_compile(<objlist_variable> <cuda_files>)
macro(xgboost_cuda_compile objlist_variable)
foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
set(${var}_backup_in_cuda_compile_ "${${var}}")
# we remove /EHa as it generates warnings under windows
string(REPLACE "/EHa" "" ${var} "${${var}}")
endforeach()
if(UNIX OR APPLE)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
endif()
if(APPLE)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
endif()
set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G -lineinfo")
if(MSVC)
# disable noisy warnings:
# 4819: The file contains a character that cannot be represented in the current code page (number).
list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endif()
# If the build system is a container, make sure the nvcc intermediate files
# go into the build output area rather than in /tmp, which may run out of space
if(IS_CONTAINER_BUILD)
set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
endif()
cuda_compile(cuda_objcs ${ARGN})
foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
set(${var} "${${var}_backup_in_cuda_compile_}")
unset(${var}_backup_in_cuda_compile_)
endforeach()
set(${objlist_variable} ${cuda_objcs})
endmacro()
################################################################################################
### Non macro section
################################################################################################
# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
if(NOT CUDA_TOOLKIT_ROOT_DIR)
find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
if(CUDA_LIBRARY_PATH)
get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
endif()
endif()
find_package(CUDA 5.5 QUIET REQUIRED)
find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand
if(NOT CUDA_FOUND)
return()
endif()
set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND xgboost_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
# Known NVIDIA GPU achitectures xgboost can be compiled for.
# This list will be used for CUDA_ARCH_NAME = All option
if(CUDA_ARCH_ALL)
set(xgboost_known_gpu_archs "${CUDA_ARCH_ALL}")
else()
if(${CUDA_VERSION} GREATER 7.5)
set(xgboost_known_gpu_archs "30 35 50 52 60 61")
else()
set(xgboost_known_gpu_archs "30 35 50 52")
endif()
endif()
# cudnn detection
if(USE_CUDNN)
detect_cuDNN()
if(HAVE_CUDNN)
add_definitions(-DUSE_CUDNN)
include_directories(SYSTEM ${CUDNN_INCLUDE})
list(APPEND xgboost_LINKER_LIBS ${CUDNN_LIBRARY})
endif()
endif()
# setting nvcc arch flags
xgboost_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
if(Boost_VERSION EQUAL 105500)
message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
# avoid warning for CMake >= 2.8.12
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
endif()
# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc.
foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
endforeach()
# setting default testing device
if(NOT CUDA_TEST_DEVICE)
set(CUDA_TEST_DEVICE -1)
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
# Handle clang/libc++ issue
if(APPLE)
xgboost_detect_darwin_version(OSX_VERSION)
# OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits
if(OSX_VERSION VERSION_GREATER 10.8)
# enabled by default if and only if CUDA version is less than 7.0
xgboost_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
endif()
endif()
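The deleted helper above auto-detected installed GPUs by compiling and running a small CUDA probe with nvcc and parsing the printed compute capabilities. A rough Python sketch of the same detection idea (hypothetical helper, assuming `nvcc` is on PATH; nvcc pre-includes `cuda_runtime.h` for `.cu` files, so the probe needs only `<cstdio>`):

```python
import os
import re
import subprocess
import tempfile

# CUDA probe equivalent to the one the deleted cmake wrote:
# print "major.minor" for every visible device.
DETECT_SRC = r"""
#include <cstdio>
int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) return -1;
  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) == cudaSuccess)
      std::printf("%d.%d ", prop.major, prop.minor);
  }
  return 0;
}
"""

def detect_gpu_archs():
    """Return compute capabilities such as ['6.1', '3.5'], or [] on failure."""
    with tempfile.TemporaryDirectory() as tmp:
        cufile = os.path.join(tmp, "detect_cuda_archs.cu")
        with open(cufile, "w") as f:
            f.write(DETECT_SRC)
        try:
            out = subprocess.run(
                ["nvcc", "-arch", "sm_30", "--run", cufile],
                cwd=tmp, capture_output=True, text=True, timeout=120)
        except (OSError, subprocess.TimeoutExpired):
            return []
        if out.returncode != 0:
            return []
        return re.findall(r"[0-9]+\.[0-9]+", out.stdout)
```

The CMake version kept a fallback list (`xgboost_known_gpu_archs`) for when detection failed; after this commit the build simply takes `GPU_COMPUTE_VER` as given.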


@@ -1,398 +0,0 @@
################################################################################################
# Command alias for debugging messages
# Usage:
# dmsg(<message>)
function(dmsg)
message(STATUS ${ARGN})
endfunction()
################################################################################################
# Removes duplicates from list(s)
# Usage:
# xgboost_list_unique(<list_variable> [<list_variable>] [...])
macro(xgboost_list_unique)
foreach(__lst ${ARGN})
if(${__lst})
list(REMOVE_DUPLICATES ${__lst})
endif()
endforeach()
endmacro()
################################################################################################
# Clears variables from list
# Usage:
# xgboost_clear_vars(<variables_list>)
macro(xgboost_clear_vars)
foreach(_var ${ARGN})
unset(${_var})
endforeach()
endmacro()
################################################################################################
# Removes duplicates from string
# Usage:
# xgboost_string_unique(<string_variable>)
function(xgboost_string_unique __string)
if(${__string})
set(__list ${${__string}})
separate_arguments(__list)
list(REMOVE_DUPLICATES __list)
foreach(__e ${__list})
set(__str "${__str} ${__e}")
endforeach()
set(${__string} ${__str} PARENT_SCOPE)
endif()
endfunction()
################################################################################################
# Prints list element per line
# Usage:
# xgboost_print_list(<list>)
function(xgboost_print_list)
foreach(e ${ARGN})
message(STATUS ${e})
endforeach()
endfunction()
################################################################################################
# Function merging lists of compiler flags to single string.
# Usage:
# xgboost_merge_flag_lists(out_variable <list1> [<list2>] [<list3>] ...)
function(xgboost_merge_flag_lists out_var)
set(__result "")
foreach(__list ${ARGN})
foreach(__flag ${${__list}})
string(STRIP ${__flag} __flag)
set(__result "${__result} ${__flag}")
endforeach()
endforeach()
string(STRIP ${__result} __result)
set(${out_var} ${__result} PARENT_SCOPE)
endfunction()
################################################################################################
# Converts all paths in list to absolute
# Usage:
# xgboost_convert_absolute_paths(<list_variable>)
function(xgboost_convert_absolute_paths variable)
set(__dlist "")
foreach(__s ${${variable}})
get_filename_component(__abspath ${__s} ABSOLUTE)
list(APPEND __list ${__abspath})
endforeach()
set(${variable} ${__list} PARENT_SCOPE)
endfunction()
################################################################################################
# Reads set of version defines from the header file
# Usage:
# xgboost_parse_header(<file> <define1> <define2> <define3> ..)
macro(xgboost_parse_header FILENAME FILE_VAR)
set(vars_regex "")
set(__parnet_scope OFF)
set(__add_cache OFF)
foreach(name ${ARGN})
if("${name}" STREQUAL "PARENT_SCOPE")
set(__parnet_scope ON)
elseif("${name}" STREQUAL "CACHE")
set(__add_cache ON)
elseif(vars_regex)
set(vars_regex "${vars_regex}|${name}")
else()
set(vars_regex "${name}")
endif()
endforeach()
if(EXISTS "${FILENAME}")
file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" )
else()
unset(${FILE_VAR})
endif()
foreach(name ${ARGN})
if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
if(${FILE_VAR})
if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
else()
set(${name} "")
endif()
if(__add_cache)
set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
elseif(__parnet_scope)
set(${name} "${${name}}" PARENT_SCOPE)
endif()
else()
unset(${name} CACHE)
endif()
endif()
endforeach()
endmacro()
################################################################################################
# Reads single version define from the header file and parses it
# Usage:
# xgboost_parse_header_single_define(<library_name> <file> <define_name>)
function(xgboost_parse_header_single_define LIBNAME HDR_PATH VARNAME)
set(${LIBNAME}_H "")
if(EXISTS "${HDR_PATH}")
file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
endif()
if(${LIBNAME}_H)
string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}")
string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE)
set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE)
set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE)
set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE)
# append a TWEAK version if it exists:
set(${LIBNAME}_VERSION_TWEAK "")
if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$")
set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE)
endif()
if(${LIBNAME}_VERSION_TWEAK)
set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE)
else()
set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE)
endif()
endif()
endfunction()
########################################################################################################
# An option that the user can select. Can accept condition to control when option is available for user.
# Usage:
# xgboost_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
function(xgboost_option variable description value)
set(__value ${value})
set(__condition "")
set(__varname "__value")
foreach(arg ${ARGN})
if(arg STREQUAL "IF" OR arg STREQUAL "if")
set(__varname "__condition")
else()
list(APPEND ${__varname} ${arg})
endif()
endforeach()
unset(__varname)
if("${__condition}" STREQUAL "")
set(__condition 2 GREATER 1)
endif()
if(${__condition})
if("${__value}" MATCHES ";")
if(${__value})
option(${variable} "${description}" ON)
else()
option(${variable} "${description}" OFF)
endif()
elseif(DEFINED ${__value})
if(${__value})
option(${variable} "${description}" ON)
else()
option(${variable} "${description}" OFF)
endif()
else()
option(${variable} "${description}" ${__value})
endif()
else()
unset(${variable} CACHE)
endif()
endfunction()
################################################################################################
# Utility macro for comparing two lists. Used for CMake debugging purposes
# Usage:
# xgboost_compare_lists(<list_variable> <list2_variable> [description])
function(xgboost_compare_lists list1 list2 desc)
set(__list1 ${${list1}})
set(__list2 ${${list2}})
list(SORT __list1)
list(SORT __list2)
list(LENGTH __list1 __len1)
list(LENGTH __list2 __len2)
if(NOT ${__len1} EQUAL ${__len2})
message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}")
endif()
foreach(__i RANGE 1 ${__len1})
math(EXPR __index "${__i}- 1")
list(GET __list1 ${__index} __item1)
list(GET __list2 ${__index} __item2)
if(NOT ${__item1} STREQUAL ${__item2})
message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. ${desc}")
endif()
endforeach()
endfunction()
################################################################################################
# Command for disabling warnings for different platforms (see below for gcc and VisualStudio)
# Usage:
# xgboost_warnings_disable(<CMAKE_[C|CXX]_FLAGS[_CONFIGURATION]> -Wshadow /wd4996 ..,)
macro(xgboost_warnings_disable)
set(_flag_vars "")
set(_msvc_warnings "")
set(_gxx_warnings "")
foreach(arg ${ARGN})
if(arg MATCHES "^CMAKE_")
list(APPEND _flag_vars ${arg})
elseif(arg MATCHES "^/wd")
list(APPEND _msvc_warnings ${arg})
elseif(arg MATCHES "^-W")
list(APPEND _gxx_warnings ${arg})
endif()
endforeach()
if(NOT _flag_vars)
set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
if(MSVC AND _msvc_warnings)
foreach(var ${_flag_vars})
foreach(warning ${_msvc_warnings})
set(${var} "${${var}} ${warning}")
endforeach()
endforeach()
elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings)
foreach(var ${_flag_vars})
foreach(warning ${_gxx_warnings})
if(NOT warning MATCHES "^-Wno-")
string(REPLACE "${warning}" "" ${var} "${${var}}")
string(REPLACE "-W" "-Wno-" warning "${warning}")
endif()
set(${var} "${${var}} ${warning}")
endforeach()
endforeach()
endif()
xgboost_clear_vars(_flag_vars _msvc_warnings _gxx_warnings)
endmacro()
################################################################################################
# Helper function get current definitions
# Usage:
# xgboost_get_current_definitions(<definitions_variable>)
function(xgboost_get_current_definitions definitions_var)
get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS)
set(result "")
foreach(d ${current_definitions})
list(APPEND result -D${d})
endforeach()
xgboost_list_unique(result)
set(${definitions_var} ${result} PARENT_SCOPE)
endfunction()
################################################################################################
# Helper function get current includes/definitions
# Usage:
# xgboost_get_current_cflags(<cflagslist_variable>)
function(xgboost_get_current_cflags cflags_var)
get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
xgboost_convert_absolute_paths(current_includes)
xgboost_get_current_definitions(cflags)
foreach(i ${current_includes})
list(APPEND cflags "-I${i}")
endforeach()
xgboost_list_unique(cflags)
set(${cflags_var} ${cflags} PARENT_SCOPE)
endfunction()
################################################################################################
# Helper function to parse current linker libs into link directories, libflags and osx frameworks
# Usage:
# xgboost_parse_linker_libs(<xgboost_LINKER_LIBS_var> <directories_var> <libflags_var> <frameworks_var>)
function(xgboost_parse_linker_libs xgboost_LINKER_LIBS_variable folders_var flags_var frameworks_var)
set(__unspec "")
set(__debug "")
set(__optimized "")
set(__framework "")
set(__varname "__unspec")
# split libs into debug, optimized, unspecified and frameworks
foreach(list_elem ${${xgboost_LINKER_LIBS_variable}})
if(list_elem STREQUAL "debug")
set(__varname "__debug")
elseif(list_elem STREQUAL "optimized")
set(__varname "__optimized")
elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)")
list(APPEND __framework -framework ${CMAKE_MATCH_1})
else()
list(APPEND ${__varname} ${list_elem})
set(__varname "__unspec")
endif()
endforeach()
# attach debug or optimized libs to unspecified according to current configuration
if(CMAKE_BUILD_TYPE MATCHES "Debug")
set(__libs ${__unspec} ${__debug})
else()
set(__libs ${__unspec} ${__optimized})
endif()
set(libflags "")
set(folders "")
# convert linker libraries list to link flags
foreach(lib ${__libs})
if(TARGET ${lib})
list(APPEND folders $<TARGET_LINKER_FILE_DIR:${lib}>)
list(APPEND libflags -l${lib})
elseif(lib MATCHES "^-l.*")
list(APPEND libflags ${lib})
elseif(IS_ABSOLUTE ${lib})
get_filename_component(name_we ${lib} NAME_WE)
get_filename_component(folder ${lib} PATH)
string(REGEX MATCH "^lib(.*)" __match ${name_we})
list(APPEND libflags -l${CMAKE_MATCH_1})
list(APPEND folders ${folder})
else()
message(FATAL_ERROR "Logic error. Need to update cmake script")
endif()
endforeach()
xgboost_list_unique(libflags folders)
set(${folders_var} ${folders} PARENT_SCOPE)
set(${flags_var} ${libflags} PARENT_SCOPE)
set(${frameworks_var} ${__framework} PARENT_SCOPE)
endfunction()
################################################################################################
# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, ....
# Usage:
# xgboost_detect_darwin_version(<version_variable>)
function(xgboost_detect_darwin_version output_var)
if(APPLE)
execute_process(COMMAND /usr/bin/sw_vers -productVersion
RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
set(${output_var} ${__sw_vers_out} PARENT_SCOPE)
else()
set(${output_var} "" PARENT_SCOPE)
endif()
endfunction()
################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
# caffe_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
function(xgboost_source_group group)
cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
if(CAFFE_SOURCE_GROUP_GLOB)
file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB})
source_group(${group} FILES ${srcs1})
endif()
if(CAFFE_SOURCE_GROUP_GLOB_RECURSE)
file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE})
source_group(${group} FILES ${srcs2})
endif()
endfunction()


@@ -24,8 +24,7 @@ param['eval_metric'] = 'auc'
param['max_depth'] = 5
param['eta'] = 0.3
param['silent'] = 0
-param['updater'] = 'grow_gpu'
-#param['updater'] = 'grow_colmaker'
+param['tree_method'] = 'gpu_exact'
num_round = 20
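For context, an end-to-end run of the demo's new parameter style might look like this (a hedged sketch on synthetic data; `gpu_exact` needs an XGBoost build with `PLUGIN_UPDATER_GPU`):

```python
import numpy as np
import xgboost as xgb

# Synthetic binary classification problem, purely illustrative.
X = np.random.randn(10000, 20)
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)
dtrain = xgb.DMatrix(X, label=y)

param = {'objective': 'binary:logistic',
         'eval_metric': 'auc',
         'max_depth': 5,
         'eta': 0.3,
         'silent': 0,
         # formerly: param['updater'] = 'grow_gpu'
         'tree_method': 'gpu_exact'}

bst = xgb.train(param, dtrain, num_boost_round=20)
```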


@@ -1,16 +1,16 @@
# CUDA Accelerated Tree Construction Algorithms
This plugin adds GPU accelerated tree construction algorithms to XGBoost.
## Usage
-Specify the 'updater' parameter as one of the following algorithms.
+Specify the 'tree_method' parameter as one of the following algorithms.
### Algorithms
-| updater | Description |
+| tree_method | Description |
| --- | --- |
-grow_gpu | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'grow_gpu_hist' |
-grow_gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Faster and uses considerably less memory. Splits may be less accurate. |
+gpu_exact | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'gpu_hist' |
+gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Faster and uses considerably less memory. Splits may be less accurate. |
### Supported parameters
-| parameter | grow_gpu | grow_gpu_hist |
+| parameter | gpu_exact | gpu_hist |
| --- | --- | --- |
subsample | &#10004; | &#10004; |
colsample_bytree | &#10004; | &#10004;|
@@ -29,7 +29,7 @@ Python example:
```python
param['gpu_id'] = 1
param['max_bin'] = 16
-param['updater'] = 'grow_gpu_hist'
+param['tree_method'] = 'gpu_hist'
```
## Benchmarks
To run benchmarks on synthetic data for binary classification:
@@ -39,18 +39,18 @@ $ python benchmark/benchmark.py
Training time on 1000000 rows x 50 columns with 500 boosting iterations on i7-6700K CPU @ 4.00GHz and Pascal Titan X.
-| Updater | Time (s) |
+| tree_method | Time (s) |
| --- | --- |
-| grow_gpu_hist | 11.09 |
-| grow_fast_histmaker (histogram XGBoost - CPU) | 41.75 |
-| grow_gpu | 193.90 |
-| grow_colmaker (standard XGBoost - CPU) | 720.12 |
+| gpu_hist | 11.09 |
+| hist (histogram XGBoost - CPU) | 41.75 |
+| gpu_exact | 193.90 |
+| exact (standard XGBoost - CPU) | 720.12 |
-[See here](http://dmlc.ml/2016/12/14/GPU-accelerated-xgboost.html) for additional performance benchmarks of the 'grow_gpu' updater.
+[See here](http://dmlc.ml/2016/12/14/GPU-accelerated-xgboost.html) for additional performance benchmarks of the 'gpu_exact' tree_method.
## Test
To run tests:
```bash
$ python -m nose test/python/
```
@@ -122,6 +122,13 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
```
## Changelog
+##### 2017/6/26
+* Change API to use tree_method parameter
+* Increase required cmake version to 3.5
+* Add compute arch 3.5 to default archs
+* Set default n_gpus to 1
##### 2017/6/5
* Multi-GPU support for histogram method using NVIDIA NCCL.
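As a usage note for the changelog entries, the renamed histogram method and the new `n_gpus` default combine roughly as follows (a hedged sketch; multi-GPU requires the NCCL-enabled build from the 2017/6/5 entry):

```python
# Single GPU (the new default, n_gpus=1):
param = {'objective': 'binary:logistic',
         'tree_method': 'gpu_hist',  # formerly updater='grow_gpu_hist'
         'max_bin': 16}

# Multi-GPU histogram training (NCCL-enabled build assumed):
param_multi = dict(param, n_gpus=2)
```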


@@ -14,19 +14,18 @@ def run_benchmark(args, gpu_algorithm, cpu_algorithm):
dtrain = xgb.DMatrix(X, y)
param = {'objective': 'binary:logistic',
-'tree_method': 'exact',
'max_depth': 6,
'silent': 1,
'eval_metric': 'auc'}
-param['updater'] = gpu_algorithm
-print("Training with '%s'" % param['updater'])
+param['tree_method'] = gpu_algorithm
+print("Training with '%s'" % param['tree_method'])
tmp = time.time()
xgb.train(param, dtrain, args.iterations)
print ("Time: %s seconds" % (str(time.time() - tmp)))
-param['updater'] = cpu_algorithm
-print("Training with '%s'" % param['updater'])
+param['tree_method'] = cpu_algorithm
+print("Training with '%s'" % param['tree_method'])
tmp = time.time()
xgb.train(param, dtrain, args.iterations)
print ("Time: %s seconds" % (str(time.time() - tmp)))
@@ -34,17 +33,17 @@ def run_benchmark(args, gpu_algorithm, cpu_algorithm):
parser = argparse.ArgumentParser()
-parser.add_argument('--algorithm', choices=['all', 'grow_gpu', 'grow_gpu_hist'], required=True)
+parser.add_argument('--algorithm', choices=['all', 'gpu_exact', 'gpu_hist'], default='all')
parser.add_argument('--rows',type=int,default=1000000)
parser.add_argument('--columns',type=int,default=50)
parser.add_argument('--iterations',type=int,default=500)
args = parser.parse_args()
-if 'grow_gpu_hist' in args.algorithm:
-run_benchmark(args, args.algorithm, 'grow_fast_histmaker')
-if 'grow_gpu' in args.algorithm:
-run_benchmark(args, args.algorithm, 'grow_colmaker')
+if 'gpu_hist' in args.algorithm:
+run_benchmark(args, args.algorithm, 'hist')
+if 'gpu_exact' in args.algorithm:
+run_benchmark(args, args.algorithm, 'exact')
if 'all' in args.algorithm:
-run_benchmark(args, 'grow_gpu', 'grow_colmaker')
-run_benchmark(args, 'grow_gpu_hist', 'grow_fast_histmaker')
+run_benchmark(args, 'gpu_exact', 'exact')
+run_benchmark(args, 'gpu_hist', 'hist')
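With `--algorithm` now defaulting to `all`, the script runs without arguments; a hedged sketch of driving it programmatically from the plugin directory (flag names taken from the argparse definition above):

```python
import subprocess
import sys

# Run only the GPU histogram comparison on a smaller problem size.
subprocess.check_call([
    sys.executable, "benchmark/benchmark.py",
    "--algorithm", "gpu_hist",
    "--rows", "100000",
    "--columns", "50",
    "--iterations", "10",
])
```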


@@ -35,7 +35,7 @@ class TestGPU(unittest.TestCase):
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': 2,
-'updater': 'grow_gpu',
+'tree_method': 'gpu_exact',
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
@@ -59,7 +59,7 @@ class TestGPU(unittest.TestCase):
dtest = xgb.DMatrix(X_test, y_test)
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu',
+'tree_method': 'gpu_exact',
'max_depth': 3,
'eval_metric': 'auc'}
res = {}
@@ -75,7 +75,7 @@ class TestGPU(unittest.TestCase):
dtrain2 = xgb.DMatrix(X2, label=y2)
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu',
+'tree_method': 'gpu_exact',
'max_depth': 2,
'eval_metric': 'auc'}
res = {}
@@ -134,7 +134,7 @@ class TestGPU(unittest.TestCase):
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': max_depth,
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': 1,
@@ -142,7 +142,7 @@ class TestGPU(unittest.TestCase):
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_param3 = {'max_depth': max_depth,
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': n_gpus,
@@ -177,7 +177,7 @@ class TestGPU(unittest.TestCase):
dtest = xgb.DMatrix(X_test, y_test)
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': 1,
'max_bin': max_bin,
@@ -189,7 +189,7 @@ class TestGPU(unittest.TestCase):
assert self.non_decreasing(res['train']['auc'])
#assert self.non_decreasing(res['test']['auc'])
param2 = {'objective': 'binary:logistic',
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
@@ -211,7 +211,7 @@ class TestGPU(unittest.TestCase):
dtrain2 = xgb.DMatrix(X2, label=y2)
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
@@ -250,7 +250,7 @@ class TestGPU(unittest.TestCase):
######################################################################
# fail-safe test for max_bin
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
@@ -263,7 +263,7 @@ class TestGPU(unittest.TestCase):
######################################################################
# subsampling
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
@@ -279,7 +279,7 @@ class TestGPU(unittest.TestCase):
######################################################################
# fail-safe test for max_bin=2
param = {'objective': 'binary:logistic',
-'updater': 'grow_gpu_hist',
+'tree_method': 'gpu_hist',
'max_depth': 2,
'n_gpus': n_gpus,
'eval_metric': 'auc',


@@ -4,19 +4,19 @@
* \brief Implementation of learning algorithm.
* \author Tianqi Chen
*/
-#include <xgboost/logging.h>
-#include <xgboost/learner.h>
-#include <dmlc/timer.h>
#include <dmlc/io.h>
+#include <dmlc/timer.h>
+#include <xgboost/learner.h>
+#include <xgboost/logging.h>
#include <algorithm>
-#include <vector>
-#include <utility>
-#include <string>
-#include <sstream>
-#include <limits>
#include <iomanip>
-#include "./common/io.h"
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
#include "./common/common.h"
+#include "./common/io.h"
#include "./common/random.h"

namespace xgboost {
@@ -25,17 +25,14 @@ bool Learner::AllowLazyCheckPoint() const {
return gbm_->AllowLazyCheckPoint();
}

-std::vector<std::string>
-Learner::DumpModel(const FeatureMap& fmap,
-bool with_stats,
-std::string format) const {
+std::vector<std::string> Learner::DumpModel(const FeatureMap& fmap,
+bool with_stats,
+std::string format) const {
return gbm_->DumpModel(fmap, with_stats, format);
}

/*! \brief training parameter for regression */
-struct LearnerModelParam
-: public dmlc::Parameter<LearnerModelParam> {
+struct LearnerModelParam : public dmlc::Parameter<LearnerModelParam> {
/* \brief global bias */
bst_float base_score;
/* \brief number of features */
@@ -55,20 +52,21 @@ struct LearnerModelParam
}
// declare parameters
DMLC_DECLARE_PARAMETER(LearnerModelParam) {
-DMLC_DECLARE_FIELD(base_score).set_default(0.5f)
+DMLC_DECLARE_FIELD(base_score)
+.set_default(0.5f)
.describe("Global bias of the model.");
-DMLC_DECLARE_FIELD(num_feature).set_default(0)
-.describe("Number of features in training data,"\
-" this parameter will be automatically detected by learner.");
-DMLC_DECLARE_FIELD(num_class).set_default(0).set_lower_bound(0)
-.describe("Number of class option for multi-class classifier. "\
-" By default equals 0 and corresponds to binary classifier.");
+DMLC_DECLARE_FIELD(num_feature)
+.set_default(0)
+.describe(
+"Number of features in training data,"
+" this parameter will be automatically detected by learner.");
+DMLC_DECLARE_FIELD(num_class).set_default(0).set_lower_bound(0).describe(
+"Number of class option for multi-class classifier. "
+" By default equals 0 and corresponds to binary classifier.");
}
};

-struct LearnerTrainParam
-: public dmlc::Parameter<LearnerTrainParam> {
+struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
// stored random seed
int seed;
// whether seed the PRNG each iteration
@@ -90,30 +88,40 @@ struct LearnerTrainParam
int debug_verbose;
// declare parameters
DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
-DMLC_DECLARE_FIELD(seed).set_default(0)
-.describe("Random number seed during training.");
-DMLC_DECLARE_FIELD(seed_per_iteration).set_default(false)
-.describe("Seed PRNG determnisticly via iterator number, "\
-"this option will be switched on automatically on distributed mode.");
-DMLC_DECLARE_FIELD(dsplit).set_default(0)
+DMLC_DECLARE_FIELD(seed).set_default(0).describe(
+"Random number seed during training.");
+DMLC_DECLARE_FIELD(seed_per_iteration)
+.set_default(false)
+.describe(
+"Seed PRNG determnisticly via iterator number, "
+"this option will be switched on automatically on distributed "
+"mode.");
+DMLC_DECLARE_FIELD(dsplit)
+.set_default(0)
.add_enum("auto", 0)
.add_enum("col", 1)
.add_enum("row", 2)
.describe("Data split mode for distributed training.");
-DMLC_DECLARE_FIELD(tree_method).set_default(0)
+DMLC_DECLARE_FIELD(tree_method)
+.set_default(0)
.add_enum("auto", 0)
.add_enum("approx", 1)
.add_enum("exact", 2)
.add_enum("hist", 3)
+.add_enum("gpu_exact", 4)
+.add_enum("gpu_hist", 5)
.describe("Choice of tree construction method.");
-DMLC_DECLARE_FIELD(test_flag).set_default("")
-.describe("Internal test flag");
-DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
+DMLC_DECLARE_FIELD(test_flag).set_default("").describe(
+"Internal test flag");
+DMLC_DECLARE_FIELD(prob_buffer_row)
+.set_default(1.0f)
+.set_range(0.0f, 1.0f)
.describe("Maximum buffered row portion");
-DMLC_DECLARE_FIELD(max_row_perbatch).set_default(std::numeric_limits<size_t>::max())
+DMLC_DECLARE_FIELD(max_row_perbatch)
+.set_default(std::numeric_limits<size_t>::max())
.describe("maximum row per batch.");
-DMLC_DECLARE_FIELD(nthread).set_default(0)
-.describe("Number of threads to use.");
+DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
+"Number of threads to use.");
DMLC_DECLARE_FIELD(debug_verbose)
.set_lower_bound(0)
.set_default(0)
@@ -125,8 +133,8 @@ DMLC_REGISTER_PARAMETER(LearnerModelParam);
DMLC_REGISTER_PARAMETER(LearnerTrainParam);

/*!
-* \brief learner that performs gradient boosting for a specific objective function.
-* It does training and prediction.
+* \brief learner that performs gradient boosting for a specific objective
+* function. It does training and prediction.
*/
class LearnerImpl : public Learner {
public:
@@ -137,14 +145,41 @@ class LearnerImpl : public Learner {
name_gbm_ = "gbtree";
}

-void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+void ConfigureUpdaters() {
+if (tparam.tree_method == 0 || tparam.tree_method == 1 ||
+tparam.tree_method == 2) {
+if (cfg_.count("updater") == 0) {
+if (tparam.dsplit == 1) {
+cfg_["updater"] = "distcol";
+} else if (tparam.dsplit == 2) {
+cfg_["updater"] = "grow_histmaker,prune";
+}
+if (tparam.prob_buffer_row != 1.0f) {
+cfg_["updater"] = "grow_histmaker,refresh,prune";
+}
+}
+} else if (tparam.tree_method == 3) {
+/* histogram-based algorithm */
+LOG(CONSOLE) << "Tree method is selected to be \'hist\', which uses a "
+"single updater "
+<< "grow_fast_histmaker.";
+cfg_["updater"] = "grow_fast_histmaker";
+} else if (tparam.tree_method == 4) {
+cfg_["updater"] = "grow_gpu,prune";
+} else if (tparam.tree_method == 5) {
+cfg_["updater"] = "grow_gpu_hist";
+}
+}
+
+void Configure(
+const std::vector<std::pair<std::string, std::string> >& args) override {
// add to configurations
tparam.InitAllowUnknown(args);
cfg_.clear();
for (const auto& kv : args) {
if (kv.first == "eval_metric") {
// check duplication
-auto dup_check = [&kv](const std::unique_ptr<Metric>&m) {
+auto dup_check = [&kv](const std::unique_ptr<Metric>& m) {
return m->Name() != kv.second;
};
if (std::all_of(metrics_.begin(), metrics_.end(), dup_check)) {
@@ -172,27 +207,13 @@ class LearnerImpl : public Learner {
}
}

-if (cfg_.count("max_delta_step") == 0 &&
-cfg_.count("objective") != 0 &&
+if (cfg_.count("max_delta_step") == 0 && cfg_.count("objective") != 0 &&
cfg_["objective"] == "count:poisson") {
cfg_["max_delta_step"] = "0.7";
}

-if (tparam.tree_method == 3) {
-/* histogram-based algorithm */
-LOG(CONSOLE) << "Tree method is selected to be \'hist\', which uses a single updater "
-<< "grow_fast_histmaker.";
-cfg_["updater"] = "grow_fast_histmaker";
-} else if (cfg_.count("updater") == 0) {
-if (tparam.dsplit == 1) {
-cfg_["updater"] = "distcol";
-} else if (tparam.dsplit == 2) {
-cfg_["updater"] = "grow_histmaker,prune";
-}
-if (tparam.prob_buffer_row != 1.0f) {
-cfg_["updater"] = "grow_histmaker,refresh,prune";
-}
-}
+ConfigureUpdaters();

if (cfg_.count("objective") == 0) {
cfg_["objective"] = "reg:linear";
}
@@ -220,9 +241,7 @@ class LearnerImpl : public Learner {
}
}

-void InitModel() override {
-this->LazyInitModel();
-}
+void InitModel() override { this->LazyInitModel(); }

void Load(dmlc::Stream* fi) override {
// TODO(tqchen) mark deprecation of old format.
@@ -256,11 +275,10 @@ class LearnerImpl : public Learner {
if (len != 0) {
name_obj_.resize(len);
CHECK_EQ(fi->Read(&name_obj_[0], len), len)
-<<"BoostLearner: wrong model format";
+<< "BoostLearner: wrong model format";
}
}
-CHECK(fi->Read(&name_gbm_))
-<< "BoostLearner: wrong model format";
+CHECK(fi->Read(&name_gbm_)) << "BoostLearner: wrong model format";
// duplicated code with LazyInitModel
obj_.reset(ObjFunction::Create(name_obj_));
gbm_.reset(GradientBooster::Create(name_gbm_, cache_, mparam.base_score));
@@ -268,13 +286,13 @@ class LearnerImpl : public Learner {
if (mparam.contain_extra_attrs != 0) {
std::vector<std::pair<std::string, std::string> > attr;
fi->Read(&attr);
-attributes_ = std::map<std::string, std::string>(
-attr.begin(), attr.end());
+attributes_ =
+std::map<std::string, std::string>(attr.begin(), attr.end());
}
if (name_obj_ == "count:poisson") {
std::string max_delta_step;
fi->Read(&max_delta_step);
cfg_["max_delta_step"] = max_delta_step;
}
if (mparam.contain_eval_metrics != 0) {
std::vector<std::string> metr;
@@ -289,7 +307,7 @@ class LearnerImpl : public Learner {
}

// rabit save model to rabit checkpoint
-void Save(dmlc::Stream *fo) const override {
+void Save(dmlc::Stream* fo) const override {
fo->Write(&mparam, sizeof(LearnerModelParam));
fo->Write(name_obj_);
fo->Write(name_gbm_);
@@ -300,9 +318,9 @@ class LearnerImpl : public Learner {
fo->Write(attr);
}
if (name_obj_ == "count:poisson") {
-std::map<std::string, std::string>::const_iterator it = cfg_.find("max_delta_step");
-if (it != cfg_.end())
-fo->Write(it->second);
+std::map<std::string, std::string>::const_iterator it =
+cfg_.find("max_delta_step");
+if (it != cfg_.end()) fo->Write(it->second);
}
if (mparam.contain_eval_metrics != 0) {
std::vector<std::string> metr;
@@ -325,8 +343,7 @@ class LearnerImpl : public Learner {
gbm_->DoBoost(train, &gpair_, obj_.get());
}

-void BoostOneIter(int iter,
-DMatrix* train,
+void BoostOneIter(int iter, DMatrix* train,
std::vector<bst_gpair>* in_gpair) override {
if (tparam.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
@@ -335,13 +352,11 @@ class LearnerImpl : public Learner {
gbm_->DoBoost(train, in_gpair);
}

-std::string EvalOneIter(int iter,
-const std::vector<DMatrix*>& data_sets,
+std::string EvalOneIter(int iter, const std::vector<DMatrix*>& data_sets,
const std::vector<std::string>& data_names) override {
double tstart = dmlc::GetTime();
std::ostringstream os;
-os << '[' << iter << ']'
-<< std::setiosflags(std::ios::fixed);
+os << '[' << iter << ']' << std::setiosflags(std::ios::fixed);
if (metrics_.size() == 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
}
@@ -388,20 +403,19 @@ class LearnerImpl : public Learner {
return out;
}

-std::pair<std::string, bst_float> Evaluate(DMatrix* data, std::string metric) {
+std::pair<std::string, bst_float> Evaluate(DMatrix* data,
+std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric();
std::unique_ptr<Metric> ev(Metric::Create(metric.c_str()));
this->PredictRaw(data, &preds_);
obj_->EvalTransform(&preds_);
-return std::make_pair(metric, ev->Eval(preds_, data->info(), tparam.dsplit == 2));
+return std::make_pair(metric,
+ev->Eval(preds_, data->info(), tparam.dsplit == 2));
}

-void Predict(DMatrix* data,
-bool output_margin,
-std::vector<bst_float> *out_preds,
-unsigned ntree_limit,
-bool pred_leaf,
-bool pred_contribs) const override {
+void Predict(DMatrix* data, bool output_margin,
+std::vector<bst_float>* out_preds, unsigned ntree_limit,
+bool pred_leaf, bool pred_contribs) const override {
if (pred_contribs) {
gbm_->PredictContribution(data, out_preds, ntree_limit);
} else if (pred_leaf) {
@@ -418,7 +432,12 @@ class LearnerImpl : public Learner {
// check if p_train is ready to used by training.
// if not, initialize the column access.
inline void LazyInitDMatrix(DMatrix* p_train) {
-if (tparam.tree_method != 3 && !p_train->HaveColAccess()) {
+if (tparam.tree_method == 3 || tparam.tree_method == 4 ||
+tparam.tree_method == 5) {
+return;
+}
+
+if (!p_train->HaveColAccess()) {
int ncol = static_cast<int>(p_train->info().num_col);
std::vector<bool> enabled(ncol, true);
// set max row per batch to limited value
@@ -426,12 +445,12 @@ class LearnerImpl : public Learner {
size_t max_row_perbatch = tparam.max_row_perbatch;
const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);

-if (tparam.tree_method == 0 &&
-p_train->info().num_row >= (4UL << 20UL)) {
-LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
+if (tparam.tree_method == 0 && p_train->info().num_row >= (4UL << 20UL)) {
+LOG(CONSOLE)
+<< "Tree method is automatically selected to be \'approx\'"
<< " for faster speed."
<< " to use old behavior(exact greedy algorithm on single machine),"
<< " set tree_method to \'exact\'";
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
}
@@ -444,15 +463,14 @@ class LearnerImpl : public Learner {
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
}
// initialize column access
-p_train->InitColAccess(enabled,
-tparam.prob_buffer_row,
-max_row_perbatch);
+p_train->InitColAccess(enabled, tparam.prob_buffer_row, max_row_perbatch);
}

if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
if (tparam.tree_method == 2) {
LOG(CONSOLE) << "tree method is set to be 'exact',"
-<< " but currently we are only able to proceed with approximate algorithm";
+<< " but currently we are only able to proceed with "
+"approximate algorithm";
}
cfg_["updater"] = "grow_histmaker,prune";
if (gbm_.get() != nullptr) {
@@ -462,9 +480,7 @@ class LearnerImpl : public Learner {
}

// return whether model is already initialized.
-inline bool ModelInitialized() const {
-return gbm_.get() != nullptr;
-}
+inline bool ModelInitialized() const { return gbm_.get() != nullptr; }
// lazily initialize the model if it haven't yet been initialized.
inline void LazyInitModel() {
if (this->ModelInitialized()) return;
@@ -497,14 +513,11 @@ class LearnerImpl : public Learner {
* \param ntree_limit limit number of trees used for boosted tree
* predictor, when it equals 0, this means we are using all the trees
*/
-inline void PredictRaw(DMatrix* data,
-std::vector<bst_float>* out_preds,
+inline void PredictRaw(DMatrix* data, std::vector<bst_float>* out_preds,
unsigned ntree_limit = 0) const {
CHECK(gbm_.get() != nullptr)
<< "Predict must happen after Load or InitModel";
-gbm_->Predict(data,
-out_preds,
-ntree_limit);
+gbm_->Predict(data, out_preds, ntree_limit);
}
// model parameter
LearnerModelParam mparam;
@@ -530,7 +543,8 @@ class LearnerImpl : public Learner {
std::vector<std::shared_ptr<DMatrix> > cache_;
};

-Learner* Learner::Create(const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
+Learner* Learner::Create(
+const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
return new LearnerImpl(cache_data);
}
} // namespace xgboost
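The heart of the C++ change is the new `ConfigureUpdaters` routine, which maps the `tree_method` enum onto updater sequences. A hedged Python sketch of that mapping (enum values and updater strings taken from the diff above; the function shape is illustrative only):

```python
# tree_method enum values as declared in LearnerTrainParam above.
TREE_METHODS = {'auto': 0, 'approx': 1, 'exact': 2,
                'hist': 3, 'gpu_exact': 4, 'gpu_hist': 5}

def configure_updaters(tree_method, cfg, dsplit=0, prob_buffer_row=1.0):
    """Mirror of LearnerImpl::ConfigureUpdaters (illustrative)."""
    if tree_method in (0, 1, 2):          # auto / approx / exact
        if 'updater' not in cfg:
            if dsplit == 1:               # column-split distributed mode
                cfg['updater'] = 'distcol'
            elif dsplit == 2:             # row-split distributed mode
                cfg['updater'] = 'grow_histmaker,prune'
            if prob_buffer_row != 1.0:
                cfg['updater'] = 'grow_histmaker,refresh,prune'
    elif tree_method == 3:                # hist
        cfg['updater'] = 'grow_fast_histmaker'
    elif tree_method == 4:                # gpu_exact
        cfg['updater'] = 'grow_gpu,prune'
    elif tree_method == 5:                # gpu_hist
        cfg['updater'] = 'grow_gpu_hist'
    return cfg

# e.g. configure_updaters(TREE_METHODS['gpu_exact'], {})
#      -> {'updater': 'grow_gpu,prune'}
```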