[GPU-Plugin] Multi-GPU for grow_gpu_hist histogram method using NVIDIA NCCL. (#2395)

This commit is contained in:
PSEUDOTENSOR / Jonathan McKinney 2017-06-11 13:06:08 -04:00 committed by Rory Mitchell
parent e24f25e0c6
commit 41efe32aa5
19 changed files with 2009 additions and 682 deletions

3
.gitmodules vendored
View File

@ -4,6 +4,9 @@
[submodule "rabit"]
path = rabit
url = https://github.com/dmlc/rabit
[submodule "nccl"]
path = nccl
url = https://github.com/dmlc/nccl
[submodule "cub"]
path = cub
url = https://github.com/NVlabs/cub

View File

@ -94,40 +94,58 @@ if(MSVC)
else()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
#Prevent shared library being called liblibxgboost.so on Linux
set(CMAKE_SHARED_LIBRARY_PREFIX "")
endif()
set(LINK_LIBRARIES dmlccore rabit)
if(PLUGIN_UPDATER_GPU)
#Find cub
set(CUB_DIRECTORY "cub/" CACHE PATH "CUB 1.5.4 directory")
include_directories(${CUB_DIRECTORY})
# nccl
set(LINK_LIBRARIES ${LINK_LIBRARIES} nccl)
add_subdirectory(nccl)
set(NCCL_DIRECTORY ${PROJECT_SOURCE_DIR}/nccl)
include_directories(${NCCL_DIRECTORY}/src)
set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})
#Find cub
set(CUB_DIRECTORY ${PROJECT_SOURCE_DIR}/cub/)
include_directories(${CUB_DIRECTORY})
#Find googletest
set(GTEST_DIRECTORY "${CACHE_PREFIX}" CACHE PATH "Googletest directory")
include_directories(${GTEST_DIRECTORY}/include)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_60;-lineinfo;")
if(NOT MSVC)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC")
endif()
# plugin
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lineinfo;--expt-extended-lambda")
set(CUDA_SOURCES
plugin/updater_gpu/src/updater_gpu.cu
plugin/updater_gpu/src/gpu_hist_builder.cu
)
cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})
)
include(${PROJECT_SOURCE_DIR}/cmake/Utils.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/Cuda.cmake)
# use below for forcing specific arch
#cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS} -arch=compute_52)
# use below for auto-detect, but gpu_grow currently doesn't work with 61
xgboost_cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
if(MSVC)
else()
cuda_add_library(updater_gpu STATIC ${CUDA_SOURCES})
set(LINK_LIBRARIES ${LINK_LIBRARIES} updater_gpu)
endif()
else()
set(CUDA_OBJS "")
set(updater_gpu "")
endif()
add_library(objxgboost OBJECT ${SOURCES})
set_target_properties(${objxgboost} PROPERTIES POSITION_INDEPENDENT_CODE 1)
add_executable(runxgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost)
target_link_libraries(runxgboost ${LINK_LIBRARIES})
add_library(libxgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
add_executable(xgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
add_library(xgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
target_link_libraries(xgboost ${LINK_LIBRARIES})
target_link_libraries(libxgboost ${LINK_LIBRARIES})
option(JVM_BINDINGS "Build JVM bindings" OFF)
@ -136,11 +154,11 @@ if(JVM_BINDINGS)
include_directories(${JNI_INCLUDE_DIRS} jvm-packages/xgboost4j/src/native)
add_library(xgboost4j SHARED
add_library(libxgboost4j SHARED
$<TARGET_OBJECTS:objxgboost>
${CUDA_OBJS}
jvm-packages/xgboost4j/src/native/xgboost4j.cpp)
target_link_libraries(xgboost4j
target_link_libraries(libxgboost4j
${LINK_LIBRARIES}
${JNI_LIBRARIES})
endif()

289
cmake/Cuda.cmake Normal file
View File

@ -0,0 +1,289 @@
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# xgboost_detect_installed_gpus(out_variable)
function(xgboost_detect_installed_gpus out_variable)
  # Detects the compute capabilities of the GPUs on the build host by
  # compiling and running a tiny CUDA probe program with nvcc.
  # On success ${out_variable} receives the detected "major.minor" value;
  # on any failure it falls back to ${xgboost_known_gpu_archs}.
  # NOTE(review): this unconditional set() clears the variable on every
  # configure, so the CACHE INTERNAL value stored below is never reused and
  # the probe re-runs each time -- presumably intentional (hardware can
  # change between configures); confirm.
  set(CUDA_gpu_detect_output "")
  if(NOT CUDA_gpu_detect_output)
    # Write the probe program: prints "major.minor " for every visible device.
    set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
    file(WRITE ${__cufile} ""
      "#include <cstdio>\n"
      "int main()\n"
      "{\n"
      " int count = 0;\n"
      " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      " if (count == 0) return -1;\n"
      " for (int device = 0; device < count; ++device)\n"
      " {\n"
      " cudaDeviceProp prop;\n"
      " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      " std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      " }\n"
      " return 0;\n"
      "}\n")
    if(MSVC)
      # Find vcvarsall.bat and run it to set up the MSVC environment, since
      # nvcc needs cl.exe reachable on Windows before it can compile the probe.
      get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
      find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
      execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      ERROR_QUIET
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
    else()
      if(CUDA_LIBRARY_PATH)
        # Help nvcc's link step find libcudart when CUDA is off the default path.
        set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
      endif()
      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      ERROR_QUIET
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
    if(__nvcc_res EQUAL 0)
      # nvcc outputs text containing line breaks when building with MSVC.
      # The line below prevents CMake from inserting a variable with line
      # breaks in the cache
      # NOTE(review): REGEX MATCH keeps only the FIRST "X.Y" token, so on a
      # mixed-architecture multi-GPU host only one architecture survives --
      # confirm that is acceptable here.
      string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}")
      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from xgboost_detect_gpus tool" FORCE)
    else()
      message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}")
    endif()
  endif()
  # Fall back to the full known-architecture list when detection produced nothing.
  if(NOT CUDA_gpu_detect_output)
    message(WARNING "Automatic GPU detection failed. Building for all known architectures (${xgboost_known_gpu_archs}).")
    set(${out_variable} ${xgboost_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()
################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# xgboost_select_nvcc_arch_flags(out_variable)
function(xgboost_select_nvcc_arch_flags out_variable)
  # Computes the nvcc -gencode flags for the architectures selected via the
  # CUDA_ARCH_NAME cache variable.
  #   ${out_variable}          -> list of nvcc -gencode flags
  #   ${out_variable}_readable -> space-separated human-readable arch summary
  # Relies on xgboost_known_gpu_archs (set by the enclosing script) and on the
  # xgboost_detect_installed_gpus / xgboost_list_unique helpers.
  # Fixes vs. original: quoted STREQUAL comparisons (unquoted values that
  # happen to name variables get re-dereferenced) and typo fixes in the
  # user-facing cache/error strings.

  # List of arch names offered to the user.
  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual")
  set(__archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    # Hardware autodetection only makes sense when building on the target host.
    list(APPEND __archs_names "Auto")
    set(__archs_name_default "Auto")
  endif()

  # Set CUDA_ARCH_NAME strings (shown as a drop-down in cmake-gui).
  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
  set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names})
  mark_as_advanced(CUDA_ARCH_NAME)

  # Verify CUDA_ARCH_NAME value.
  if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
    string(REPLACE ";" ", " __archs_names "${__archs_names}")
    message(FATAL_ERROR "Only ${__archs_names} architecture names are supported.")
  endif()

  if("${CUDA_ARCH_NAME}" STREQUAL "Manual")
    # Expose fine-grained BIN/PTX controls only in Manual mode.
    set(CUDA_ARCH_BIN ${xgboost_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
    set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
    unset(CUDA_ARCH_PTX CACHE)
  endif()

  # Map the selected name to a raw architecture list.
  if("${CUDA_ARCH_NAME}" STREQUAL "Fermi")
    set(__cuda_arch_bin "20 21(20)")
  elseif("${CUDA_ARCH_NAME}" STREQUAL "Kepler")
    set(__cuda_arch_bin "30 35")
  elseif("${CUDA_ARCH_NAME}" STREQUAL "Maxwell")
    set(__cuda_arch_bin "50")
  elseif("${CUDA_ARCH_NAME}" STREQUAL "Pascal")
    set(__cuda_arch_bin "60 61")
  elseif("${CUDA_ARCH_NAME}" STREQUAL "All")
    set(__cuda_arch_bin ${xgboost_known_gpu_archs})
  elseif("${CUDA_ARCH_NAME}" STREQUAL "Auto")
    xgboost_detect_installed_gpus(__cuda_arch_bin)
  else() # Manual
    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()

  # Remove dots and convert to lists ("3.5" -> "35").
  string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}")
  string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}")
  string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}")
  string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}")
  xgboost_list_unique(__cuda_arch_bin __cuda_arch_ptx)

  set(__nvcc_flags "")
  set(__nvcc_archs_readable "")

  # Tell NVCC to add binaries for the specified GPUs.
  foreach(__arch ${__cuda_arch_bin})
    if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
      # User explicitly specified PTX for the concrete BIN, e.g. "21(20)".
      list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
      list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1})
    else()
      # No explicit PTX for this BIN: assume PTX=BIN.
      list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch})
      list(APPEND __nvcc_archs_readable sm_${__arch})
    endif()
  endforeach()

  # Tell NVCC to add PTX intermediate code for the specified architectures.
  foreach(__arch ${__cuda_arch_ptx})
    list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch})
    list(APPEND __nvcc_archs_readable compute_${__arch})
  endforeach()

  string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}")
  set(${out_variable} ${__nvcc_flags} PARENT_SCOPE)
  set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
endfunction()
################################################################################################
# Short command for cuda compilation
# Usage:
# xgboost_cuda_compile(<objlist_variable> <cuda_files>)
macro(xgboost_cuda_compile objlist_variable)
  # Compiles the given .cu files into objects via FindCUDA's cuda_compile(),
  # temporarily sanitising host C++ flags so nvcc does not choke on them.
  #   xgboost_cuda_compile(<objlist_variable> <cuda_files> [nvcc args...])
  # Implemented as a macro on purpose: it mutates CUDA_NVCC_FLAGS and the
  # CMAKE_CXX_FLAGS* variables in the caller's scope (restored at the end).
  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    # Back up host flags so they can be restored after cuda_compile().
    set(${var}_backup_in_cuda_compile_ "${${var}}")
    # we remove /EHa as it generates warnings under windows
    string(REPLACE "/EHa" "" ${var} "${${var}}")
  endforeach()
  if(UNIX OR APPLE)
    # Objects end up in shared libraries, so position-independent code is needed.
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
  endif()
  if(APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
  endif()
  set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G -lineinfo")
  if(MSVC)
    # disable noisy warnings:
    # 4819: The file contains a character that cannot be represented in the current code page (number).
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
    # Switch /MD -> /MT (static CRT); presumably to match how the CUDA
    # libraries are built -- TODO confirm.
    foreach(flag_var
        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif(${flag_var} MATCHES "/MD")
    endforeach(flag_var)
  endif()
  # If the build system is a container, make sure the nvcc intermediate files
  # go into the build output area rather than in /tmp, which may run out of space
  if(IS_CONTAINER_BUILD)
    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
  endif()
  # Everything after the objlist variable name is forwarded to cuda_compile().
  cuda_compile(cuda_objcs ${ARGN})
  # Restore the host C++ flags modified above.
  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var} "${${var}_backup_in_cuda_compile_}")
    unset(${var}_backup_in_cuda_compile_)
  endforeach()
  set(${objlist_variable} ${cuda_objcs})
endmacro()
################################################################################################
### Non macro section
################################################################################################
# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
if(NOT CUDA_TOOLKIT_ROOT_DIR)
  find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
  if(CUDA_LIBRARY_PATH)
    # Found .../lib[64]/libcudart.so -> toolkit root is one directory up.
    get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
  endif()
endif()
find_package(CUDA 5.5 QUIET REQUIRED)
find_cuda_helper_libs(curand) # cmake 2.8.7 compatibility: its FindCUDA doesn't search for curand
# NOTE(review): find_package(... REQUIRED) aborts configuration on failure,
# so this guard is effectively dead code; kept as belt-and-braces.
if(NOT CUDA_FOUND)
  return()
endif()
set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND xgboost_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
  ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
# Known NVIDIA GPU architectures xgboost can be compiled for.
# This list will be used for CUDA_ARCH_NAME = All option
if(CUDA_ARCH_ALL)
  # User-supplied override of the full architecture list.
  set(xgboost_known_gpu_archs "${CUDA_ARCH_ALL}")
else()
  # NOTE(review): GREATER is an integer comparison; "7.5" vs "8.0" happens to
  # work here, but VERSION_GREATER would be the robust choice -- confirm.
  if(${CUDA_VERSION} GREATER 7.5)
    # CUDA 8+ adds Pascal (sm_60/61).
    set(xgboost_known_gpu_archs "30 35 50 52 60 61")
  else()
    set(xgboost_known_gpu_archs "30 35 50 52")
  endif()
endif()
# cudnn detection
if(USE_CUDNN)
  # NOTE(review): detect_cuDNN() is not defined in this file -- confirm it is
  # provided elsewhere before enabling USE_CUDNN.
  detect_cuDNN()
  if(HAVE_CUDNN)
    add_definitions(-DUSE_CUDNN)
    include_directories(SYSTEM ${CUDNN_INCLUDE})
    list(APPEND xgboost_LINKER_LIBS ${CUDNN_LIBRARY})
  endif()
endif()
# setting nvcc arch flags
xgboost_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
if(Boost_VERSION EQUAL 105500)
  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
  # avoid warning for CMake >= 2.8.12
  # NOTE(review): this set() space-joins CUDA_NVCC_FLAGS into a single string,
  # flattening the list built above -- confirm downstream consumers cope.
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
endif()
# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
endforeach()
# setting default testing device; -1 presumably means "no specific device" -- confirm
if(NOT CUDA_TEST_DEVICE)
  set(CUDA_TEST_DEVICE -1)
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
# Handle clang/libc++ issue
if(APPLE)
  xgboost_detect_darwin_version(OSX_VERSION)
  # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    xgboost_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()

398
cmake/Utils.cmake Normal file
View File

@ -0,0 +1,398 @@
################################################################################################
# Command alias for debugging messages
# Usage:
# dmsg(<message>)
function(dmsg)
  # Debug helper: print all arguments as one STATUS message.
  message(STATUS ${ARGN})
endfunction()
################################################################################################
# Removes duplicates from list(s)
# Usage:
# xgboost_list_unique(<list_variable> [<list_variable>] [...])
macro(xgboost_list_unique)
  # De-duplicates, in place, every list variable whose NAME is passed in.
  # Empty or undefined lists are left untouched.
  foreach(__list_name ${ARGN})
    if(${__list_name})
      list(REMOVE_DUPLICATES ${__list_name})
    endif()
  endforeach()
endmacro()
################################################################################################
# Clears variables from list
# Usage:
# xgboost_clear_vars(<variables_list>)
macro(xgboost_clear_vars)
  # Unsets every variable named in the argument list. Being a macro, this
  # runs in the caller's scope, so the caller's variables really do go away.
  foreach(__name ${ARGN})
    unset(${__name})
  endforeach()
endmacro()
################################################################################################
# Removes duplicates from string
# Usage:
# xgboost_string_unique(<string_variable>)
function(xgboost_string_unique __string)
  # Removes duplicate whitespace-separated tokens from the string variable
  # named by __string, in place.
  # NOTE(review): the result is written back via an unquoted expansion, so the
  # caller actually receives a ;-list of the unique tokens rather than a
  # space-joined string -- confirm callers expect that.
  if(${__string})
    set(__list ${${__string}})
    separate_arguments(__list)
    list(REMOVE_DUPLICATES __list)
    foreach(__e ${__list})
      # Leading space is harmless: the unquoted set() below re-splits anyway.
      set(__str "${__str} ${__e}")
    endforeach()
    set(${__string} ${__str} PARENT_SCOPE)
  endif()
endfunction()
################################################################################################
# Prints list element per line
# Usage:
# xgboost_print_list(<list>)
function(xgboost_print_list)
  # Emits each element of the argument list on its own STATUS line.
  foreach(__item ${ARGN})
    message(STATUS ${__item})
  endforeach()
endfunction()
################################################################################################
# Function merging lists of compiler flags to single string.
# Usage:
# xgboost_merge_flag_lists(out_variable <list1> [<list2>] [<list3>] ...)
function(xgboost_merge_flag_lists out_var)
  # Merges any number of flag LISTS (passed by variable name) into a single
  # space-separated string stored in ${out_var}.
  # Fix vs. original: the STRIP arguments are quoted -- the unquoted form
  # hard-errored ("string requires ... arguments") whenever a list contained
  # an empty element, or when no flags were supplied at all.
  set(__result "")
  foreach(__list ${ARGN})
    foreach(__flag ${${__list}})
      # Drop stray whitespace around each flag before appending.
      string(STRIP "${__flag}" __flag)
      set(__result "${__result} ${__flag}")
    endforeach()
  endforeach()
  string(STRIP "${__result}" __result)
  set(${out_var} "${__result}" PARENT_SCOPE)
endfunction()
################################################################################################
# Converts all paths in list to absolute
# Usage:
# xgboost_convert_absolute_paths(<list_variable>)
function(xgboost_convert_absolute_paths variable)
set(__dlist "")
foreach(__s ${${variable}})
get_filename_component(__abspath ${__s} ABSOLUTE)
list(APPEND __list ${__abspath})
endforeach()
set(${variable} ${__list} PARENT_SCOPE)
endfunction()
################################################################################################
# Reads set of version defines from the header file
# Usage:
# xgboost_parse_header(<file> <define1> <define2> <define3> ..)
macro(xgboost_parse_header FILENAME FILE_VAR)
  # Reads numeric "#define <name> <number>" values from header FILENAME.
  #   xgboost_parse_header(<file> <file_content_var> <define1> [<define2> ...]
  #                        [PARENT_SCOPE] [CACHE])
  # For each requested define name, a variable of that same name is set to the
  # parsed number (empty if absent); PARENT_SCOPE / CACHE choose where the
  # result is stored. FILE_VAR receives the raw matching header lines.
  set(vars_regex "")
  set(__parnet_scope OFF) # (sic: "parnet" typo, macro-local name kept as-is)
  set(__add_cache OFF)
  # Build an alternation regex from the requested define names, peeling off
  # the PARENT_SCOPE / CACHE option keywords.
  foreach(name ${ARGN})
    if("${name}" STREQUAL "PARENT_SCOPE")
      set(__parnet_scope ON)
    elseif("${name}" STREQUAL "CACHE")
      set(__add_cache ON)
    elseif(vars_regex)
      set(vars_regex "${vars_regex}|${name}")
    else()
      set(vars_regex "${name}")
    endif()
  endforeach()
  if(EXISTS "${FILENAME}")
    # Keep only the lines defining one of the requested names to a number.
    file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" )
  else()
    unset(${FILE_VAR})
  endif()
  foreach(name ${ARGN})
    if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
      if(${FILE_VAR})
        # Extract the number following this particular define name.
        if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
          string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
        else()
          set(${name} "")
        endif()
        if(__add_cache)
          set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
        elseif(__parnet_scope)
          set(${name} "${${name}}" PARENT_SCOPE)
        endif()
      else()
        # Header missing or nothing matched: drop any stale cached value.
        unset(${name} CACHE)
      endif()
    endif()
  endforeach()
endmacro()
################################################################################################
# Reads single version define from the header file and parses it
# Usage:
# xgboost_parse_header_single_define(<library_name> <file> <define_name>)
function(xgboost_parse_header_single_define LIBNAME HDR_PATH VARNAME)
  # Parses a version define of the form
  #   #define <VARNAME> "MAJOR.MINOR.PATCH[.TWEAK]..."
  # from header HDR_PATH and exports <LIBNAME>_VERSION_{MAJOR,MINOR,PATCH,
  # TWEAK,STRING} to the caller's scope. Extra args (e.g. CACHE options) are
  # forwarded to the set() calls via ${ARGN}.
  set(${LIBNAME}_H "")
  if(EXISTS "${HDR_PATH}")
    # Grab the first (and only expected) matching define line.
    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
  endif()
  if(${LIBNAME}_H)
    # Pull the three dotted components out of the quoted version string.
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}")
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
    set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE)
    # append a TWEAK version if it exists:
    set(${LIBNAME}_VERSION_TWEAK "")
    if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$")
      # NOTE(review): this set() carries PARENT_SCOPE, so the LOCAL tweak
      # variable stays "" and the tweak branch below can never fire locally --
      # looks like an upstream (Caffe-derived) quirk; confirm before relying
      # on 4-component version strings.
      set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE)
    endif()
    if(${LIBNAME}_VERSION_TWEAK)
      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE)
    else()
      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE)
    endif()
  endif()
endfunction()
########################################################################################################
# An option that the user can select. Can accept condition to control when option is available for user.
# Usage:
# xgboost_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
function(xgboost_option variable description value)
  # Declares a boolean option that is only visible when an optional condition
  # holds.
  #   xgboost_option(<variable> "doc" <initial value or bool expr> [IF <cond...>])
  # When the condition fails, the cached option is removed entirely.
  set(__value ${value})
  set(__condition "")
  set(__varname "__value")
  # Split ${ARGN} at the IF keyword: everything after it is the condition.
  foreach(arg ${ARGN})
    if(arg STREQUAL "IF" OR arg STREQUAL "if")
      set(__varname "__condition")
    else()
      list(APPEND ${__varname} ${arg})
    endif()
  endforeach()
  unset(__varname)
  if("${__condition}" STREQUAL "")
    # No IF given: use a trivially-true condition.
    set(__condition 2 GREATER 1)
  endif()
  if(${__condition})
    if("${__value}" MATCHES ";")
      # Value is itself a multi-token boolean expression: evaluate it.
      # NOTE(review): this branch and the DEFINED branch below are identical;
      # presumably mirrors the upstream (Caffe) helper -- confirm before
      # simplifying.
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    elseif(DEFINED ${__value})
      # Value names another variable: use that variable's truthiness.
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    else()
      # Plain literal default (ON/OFF/TRUE/FALSE/...).
      option(${variable} "${description}" ${__value})
    endif()
  else()
    # Condition failed: hide the option completely.
    unset(${variable} CACHE)
  endif()
endfunction()
################################################################################################
# Utility macro for comparing two lists. Used for CMake debugging purposes
# Usage:
# xgboost_compare_lists(<list_variable> <list2_variable> [description])
function(xgboost_compare_lists list1 list2 desc)
  # Asserts that two lists (passed by variable name) contain the same elements
  # ignoring order; raises FATAL_ERROR otherwise. CMake debugging aid.
  #   list1/list2 -- names of the list variables to compare
  #   desc        -- text appended to the failure message
  # Fixes vs. original: empty lists are handled (the old RANGE 1..0 loop
  # indexed out of bounds) and element comparisons are quoted so values that
  # happen to name variables are not re-dereferenced by if().
  set(__list1 ${${list1}})
  set(__list2 ${${list2}})
  list(SORT __list1)
  list(SORT __list2)
  list(LENGTH __list1 __len1)
  list(LENGTH __list2 __len2)
  if(NOT __len1 EQUAL __len2)
    message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}")
  endif()
  # Two empty lists are trivially equal; this also guards the RANGE below.
  if(__len1 EQUAL 0)
    return()
  endif()
  math(EXPR __last "${__len1} - 1")
  foreach(__index RANGE 0 ${__last})
    list(GET __list1 ${__index} __item1)
    list(GET __list2 ${__index} __item2)
    if(NOT "${__item1}" STREQUAL "${__item2}")
      message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. ${desc}")
    endif()
  endforeach()
endfunction()
################################################################################################
# Command for disabling warnings for different platforms (see below for gcc and VisualStudio)
# Usage:
# xgboost_warnings_disable(<CMAKE_[C|CXX]_FLAGS[_CONFIGURATION]> -Wshadow /wd4996 ..,)
macro(xgboost_warnings_disable)
  # Disables compiler warnings in the given flag variables.
  #   xgboost_warnings_disable([CMAKE_..._FLAGS vars] /wd#### ... -W... ...)
  # /wdNNNN entries apply under MSVC; -W entries apply under gcc/clang, where
  # each -Wfoo is first stripped from the flags and re-added as -Wno-foo.
  set(_flag_vars "")
  set(_msvc_warnings "")
  set(_gxx_warnings "")
  # Sort arguments into target flag variables, MSVC warnings, GCC warnings.
  foreach(arg ${ARGN})
    if(arg MATCHES "^CMAKE_")
      list(APPEND _flag_vars ${arg})
    elseif(arg MATCHES "^/wd")
      list(APPEND _msvc_warnings ${arg})
    elseif(arg MATCHES "^-W")
      list(APPEND _gxx_warnings ${arg})
    endif()
  endforeach()
  if(NOT _flag_vars)
    # Default to the plain C/C++ flag variables when none were given.
    set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  endif()
  if(MSVC AND _msvc_warnings)
    foreach(var ${_flag_vars})
      foreach(warning ${_msvc_warnings})
        set(${var} "${${var}} ${warning}")
      endforeach()
    endforeach()
  elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings)
    foreach(var ${_flag_vars})
      foreach(warning ${_gxx_warnings})
        if(NOT warning MATCHES "^-Wno-")
          # Remove any existing enabling flag, then add its -Wno- form.
          string(REPLACE "${warning}" "" ${var} "${${var}}")
          string(REPLACE "-W" "-Wno-" warning "${warning}")
        endif()
        set(${var} "${${var}} ${warning}")
      endforeach()
    endforeach()
  endif()
  # Clean up macro-local temporaries from the caller's scope.
  xgboost_clear_vars(_flag_vars _msvc_warnings _gxx_warnings)
endmacro()
################################################################################################
# Helper function get current definitions
# Usage:
# xgboost_get_current_definitions(<definitions_variable>)
function(xgboost_get_current_definitions definitions_var)
  # Collects this directory's COMPILE_DEFINITIONS as a de-duplicated list of
  # -D<definition> flags in ${definitions_var}.
  get_property(__dir_defs DIRECTORY PROPERTY COMPILE_DEFINITIONS)
  set(__flags "")
  foreach(__def ${__dir_defs})
    list(APPEND __flags -D${__def})
  endforeach()
  xgboost_list_unique(__flags)
  set(${definitions_var} ${__flags} PARENT_SCOPE)
endfunction()
################################################################################################
# Helper function get current includes/definitions
# Usage:
# xgboost_get_current_cflags(<cflagslist_variable>)
function(xgboost_get_current_cflags cflags_var)
  # Builds the list of -D<def> and -I<dir> flags currently in effect for this
  # directory, with include paths made absolute and duplicates removed.
  get_property(__includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
  xgboost_convert_absolute_paths(__includes)
  xgboost_get_current_definitions(__flags)
  foreach(__inc ${__includes})
    list(APPEND __flags "-I${__inc}")
  endforeach()
  xgboost_list_unique(__flags)
  set(${cflags_var} ${__flags} PARENT_SCOPE)
endfunction()
################################################################################################
# Helper function to parse current linker libs into link directories, libflags and osx frameworks
# Usage:
# xgboost_parse_linker_libs(<xgboost_LINKER_LIBS_var> <directories_var> <libflags_var> <frameworks_var>)
function(xgboost_parse_linker_libs xgboost_LINKER_LIBS_variable folders_var flags_var frameworks_var)
  # Splits a CMake link-libraries list into link folders, -l flags and OSX
  # frameworks, for consumers that need raw linker command-line pieces.
  set(__unspec "")
  set(__debug "")
  set(__optimized "")
  set(__framework "")
  set(__varname "__unspec")
  # split libs into debug, optimized, unspecified and frameworks
  foreach(list_elem ${${xgboost_LINKER_LIBS_variable}})
    if(list_elem STREQUAL "debug")
      # The next element belongs to the debug-only set.
      set(__varname "__debug")
    elseif(list_elem STREQUAL "optimized")
      set(__varname "__optimized")
    elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)")
      list(APPEND __framework -framework ${CMAKE_MATCH_1})
    else()
      list(APPEND ${__varname} ${list_elem})
      # Reset so only the single element after debug/optimized is redirected.
      set(__varname "__unspec")
    endif()
  endforeach()
  # attach debug or optimized libs to unspecified according to current configuration
  if(CMAKE_BUILD_TYPE MATCHES "Debug")
    set(__libs ${__unspec} ${__debug})
  else()
    set(__libs ${__unspec} ${__optimized})
  endif()
  set(libflags "")
  set(folders "")
  # convert linker libraries list to link flags
  foreach(lib ${__libs})
    if(TARGET ${lib})
      # CMake target: link by name; folder resolved at generate time.
      list(APPEND folders $<TARGET_LINKER_FILE_DIR:${lib}>)
      list(APPEND libflags -l${lib})
    elseif(lib MATCHES "^-l.*")
      # Already a -l flag: pass through unchanged.
      list(APPEND libflags ${lib})
    elseif(IS_ABSOLUTE ${lib})
      # Absolute library path: split into folder plus -l<name>.
      get_filename_component(name_we ${lib} NAME_WE)
      get_filename_component(folder ${lib} PATH)
      # NOTE(review): if the file name does not start with "lib" this MATCH
      # fails and CMAKE_MATCH_1 keeps a stale value from an earlier match --
      # confirm inputs always follow the lib<name> convention.
      string(REGEX MATCH "^lib(.*)" __match ${name_we})
      list(APPEND libflags -l${CMAKE_MATCH_1})
      list(APPEND folders ${folder})
    else()
      message(FATAL_ERROR "Logic error. Need to update cmake script")
    endif()
  endforeach()
  xgboost_list_unique(libflags folders)
  set(${folders_var} ${folders} PARENT_SCOPE)
  set(${flags_var} ${libflags} PARENT_SCOPE)
  set(${frameworks_var} ${__framework} PARENT_SCOPE)
endfunction()
################################################################################################
# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, ....
# Usage:
# xgboost_detect_darwin_version(<version_variable>)
function(xgboost_detect_darwin_version output_var)
  # Reports the macOS product version (e.g. 10.9) via sw_vers into
  # ${output_var}; yields an empty string on non-Apple hosts.
  if(NOT APPLE)
    set(${output_var} "" PARENT_SCOPE)
    return()
  endif()
  execute_process(COMMAND /usr/bin/sw_vers -productVersion
                  RESULT_VARIABLE __sw_vers_res OUTPUT_VARIABLE __sw_vers_out
                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
  set(${output_var} ${__sw_vers_out} PARENT_SCOPE)
endfunction()
################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
# xgboost_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
function(xgboost_source_group group)
  # Registers globbed source files under an IDE source group (VS / Xcode).
  #   xgboost_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
  # Fix vs. original: the internal cmake_parse_arguments prefix is renamed
  # from the copy-pasted Caffe-era CAFFE_SOURCE_GROUP for consistency with
  # the rest of this file (the prefix is function-local, so callers are
  # unaffected).
  cmake_parse_arguments(XGB_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(XGB_SOURCE_GROUP_GLOB)
    file(GLOB __srcs1 ${XGB_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${__srcs1})
  endif()
  if(XGB_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE __srcs2 ${XGB_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${__srcs2})
  endif()
endfunction()

View File

@ -24,7 +24,11 @@
// helper functions
// set handle
void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
jlong out = (jlong) handle;
#ifdef __APPLE__
jlong out = (long) handle;
#else
int64_t out = (int64_t) handle;
#endif
jenv->SetLongArrayRegion(jhandle, 0, 1, &out);
}
@ -32,7 +36,7 @@ void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
static JavaVM* global_jvm = nullptr;
// overrides JNI on load
JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
jint JNI_OnLoad(JavaVM *vm, void *reserved) {
global_jvm = vm;
return JNI_VERSION_1_6;
}
@ -72,7 +76,7 @@ XGB_EXTERN_C int XGBoost4jCallbackDataIterNext(
batch, jenv->GetFieldID(batchClass, "featureValue", "[F"));
XGBoostBatchCSR cbatch;
cbatch.size = jenv->GetArrayLength(joffset) - 1;
cbatch.offset = reinterpret_cast<jlong *>(
cbatch.offset = reinterpret_cast<long *>(
jenv->GetLongArrayElements(joffset, 0));
if (jlabel != nullptr) {
cbatch.label = jenv->GetFloatArrayElements(jlabel, 0);

1
nccl Submodule

@ -0,0 +1 @@
Subproject commit 93183bca921b2e8e1754e27e1b43d73cf6caec9d

View File

@ -17,8 +17,11 @@ colsample_bytree | &#10004; | &#10004;|
colsample_bylevel | &#10004; | &#10004; |
max_bin | &#10006; | &#10004; |
gpu_id | &#10004; | &#10004; |
n_gpus | &#10006; | &#10004; |
All algorithms currently use only a single GPU. The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
Multiple GPUs can be used with the grow_gpu_hist method via the n_gpus parameter, which defaults to -1 (use all visible GPUs). If gpu_id is specified as non-zero, the GPU device order is (gpu_id + i) % n_visible_devices for i = 0 to n_gpus - 1. As with GPU vs. CPU, multi-GPU will not always be faster than a single GPU because PCI bus bandwidth can limit performance. For example, when n_features * n_bins * 2^depth divided by the time of each round/iteration becomes comparable to the real PCIe 16x bus bandwidth (on the order of 4GB/s to 10GB/s), AllReduce will dominate the runtime and additional GPUs become ineffective at increasing performance. Also, CPU overhead between GPU calls can limit the usefulness of multiple GPUs.
This plugin currently works with the CLI version and python version.
@ -54,29 +57,38 @@ $ python -m nose test/python/
## Dependencies
A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depends on shuffle and vote instructions introduced in Kepler).
Building the plug-in requires CUDA Toolkit 7.5 or later.
Building the plug-in requires CUDA Toolkit 7.5 or later (https://developer.nvidia.com/cuda-downloads)
submodule: The plugin also depends on CUB 1.6.4 - https://nvlabs.github.io/cub/ . CUB is a header only cuda library which provides sort/reduce/scan primitives.
submodule: NVIDIA NCCL from https://github.com/NVIDIA/nccl with windows port allowed by git@github.com:h2oai/nccl.git
## Build
### Using cmake
To use the plugin xgboost must be built by specifying the option PLUGIN_UPDATER_GPU=ON. CMake will prepare a build system depending on which platform you are on.
From the command line on Linux starting from the xgboost directory:
On Linux, from the xgboost directory:
```bash
$ mkdir build
$ cd build
$ cmake .. -DPLUGIN_UPDATER_GPU=ON
$ make
$ make -j
```
If 'make' fails try invoking make again. There can sometimes be problems with the order items are built.
On Windows you may also need to specify your generator as 64 bit, so the cmake command becomes:
On Windows using cmake, see what options for Generators you have for cmake, and choose one with [arch] replaced by Win64:
```bash
$ cmake .. -G"Visual Studio 12 2013 Win64" -DPLUGIN_UPDATER_GPU=ON
cmake -help
```
You may also be able to use a later version of visual studio depending on whether the CUDA toolkit supports it.
cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode. This is also a good time to check it is being built as x64. If not make sure the cmake generator is set correctly.
Then run cmake as:
```bash
$ mkdir build
$ cd build
$ cmake .. -G"Visual Studio 14 2015 Win64" -DPLUGIN_UPDATER_GPU=ON
```
Cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode as a x64 build.
Visual studio community 2015, supported by cuda toolkit (http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/#axzz4isREr2nS), can be downloaded from: https://my.visualstudio.com/Downloads?q=Visual%20Studio%20Community%202015 . You may also be able to use a later version of visual studio depending on whether the CUDA toolkit supports it. Note that Mingw cannot be used with cuda.
### For Developers!
### Using make
Now, it also supports the usual 'make' flow to build gpu-enabled tree construction plugins. It's currently only tested on Linux. From the xgboost directory
@ -84,9 +96,6 @@ Now, it also supports the usual 'make' flow to build gpu-enabled tree constructi
# make sure CUDA SDK bin directory is in the 'PATH' env variable
$ make PLUGIN_UPDATER_GPU=ON
```
### For Developers!
Now, some of the code-base inside gpu plugins have googletest unit-tests inside 'tests/'.
They can be enabled run along with other unit-tests inside '<xgboostRoot>/tests/cpp' using:
```bash
@ -98,10 +107,17 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
```
## Changelog
##### 2017/6/5
* Multi-GPU support for histogram method using NVIDIA NCCL.
##### 2017/5/31
* Faster version of the grow_gpu plugin
* Added support for building gpu plugin through 'make' flow too
##### 2017/5/19
* Further performance enhancements for histogram method.
##### 2017/5/5
* Histogram performance improvements
* Fix gcc build issues
@ -115,10 +131,19 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
[Mitchell, Rory, and Eibe Frank. Accelerating the XGBoost algorithm using GPU computing. No. e2911v1. PeerJ Preprints, 2017.](https://peerj.com/preprints/2911/)
## Author
Rory Mitchell,
Jonathan C. McKinney,
Shankara Rao Thejaswi Nanditale,
Vinay Deshpande,
... and the rest of the H2O.ai and NVIDIA team.
Please report bugs to the xgboost/issues page. You can tag me with @RAMitchell.
Otherwise I can be contacted at r.a.mitchell.nz at gmail.
Please report bugs to the xgboost/issues page.

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <vector>
@ -147,7 +147,8 @@ inline void dense2sparse_tree(RegTree* p_tree,
}
// Set gradient pair to 0 with p = 1 - subsample
inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample,
int offset) {
if (subsample == 1.0) {
return;
}
@ -157,13 +158,19 @@ inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
auto d_gpair = gpair.data();
dh::BernoulliRng rng(subsample, common::GlobalRandom()());
dh::launch_n(gpair.size(), [=] __device__(int i) {
if (!rng(i)) {
dh::launch_n(gpair.device_idx(), gpair.size(), [=] __device__(int i) {
if (!rng(i + offset)) {
d_gpair[i] = gpu_gpair();
}
});
}
// Set gradient pair to 0 with p = 1 - subsample
inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
int offset = 0;
subsample_gpair(p_gpair, subsample, offset);
}
inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
int n = colsample * features.size();
CHECK_GT(n, 0);
@ -233,8 +240,8 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
* @param def default value to be filled
*/
template <typename T, int BlkDim=256, int ItemsPerThread=4>
void fillConst(T* out, int len, T def) {
dh::launch_n<ItemsPerThread,BlkDim>(len, [=] __device__(int i) { out[i] = def; });
void fillConst(int device_idx, T* out, int len, T def) {
dh::launch_n<ItemsPerThread,BlkDim>(device_idx, len, [=] __device__(int i) { out[i] = def; });
}
/**
@ -247,10 +254,10 @@ void fillConst(T* out, int len, T def) {
* @param nVals length of the buffers
*/
template <typename T1, typename T2, int BlkDim=256, int ItemsPerThread=4>
void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
int nVals) {
dh::launch_n<ItemsPerThread,BlkDim>
(nVals, [=] __device__(int i) {
(device_idx, nVals, [=] __device__(int i) {
int iid = instId[i];
T1 v1 = in1[iid];
T2 v2 = in2[iid];
@ -267,9 +274,9 @@ void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
* @param nVals length of the buffers
*/
template <typename T, int BlkDim=256, int ItemsPerThread=4>
void gather(T* out, const T* in, const int* instId, int nVals) {
void gather(int device_idx, T* out, const T* in, const int* instId, int nVals) {
dh::launch_n<ItemsPerThread,BlkDim>
(nVals, [=] __device__(int i) {
(device_idx, nVals, [=] __device__(int i) {
int iid = instId[i];
out[i] = in[iid];
});

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>
@ -12,11 +12,20 @@
#include <sstream>
#include <string>
#include <vector>
#include <numeric>
#include <cub/cub.cuh>
#ifndef NCCL
#define NCCL 1
#endif
#if (NCCL)
#include "nccl.h"
#endif
// Uncomment to enable
// #define DEVICE_TIMER
// #define TIMERS
#define TIMERS
namespace dh {
@ -42,6 +51,22 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
return code;
}
#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)
#if (NCCL)
// Translate an NCCL status code into a std::runtime_error carrying
// file/line context. Returns the code unchanged when it is ncclSuccess.
inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
                                        int line) {
  if (code == ncclSuccess) {
    return code;
  }
  std::stringstream msg;
  msg << "NCCL failure :" << ncclGetErrorString(code) << " ";
  msg << file << "(" << line << ")";
  throw std::runtime_error(msg.str());
}
#endif
#define gpuErrchk(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
@ -53,6 +78,55 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
}
}
// Number of CUDA devices visible to this process.
// The cudaGetDeviceCount status is deliberately not checked: on failure
// the count stays 0, which callers treat as "no GPUs available".
inline int n_visible_devices() {
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  return device_count;
}
// Resolve the user-requested n_gpus into a device count.
// Semantics: n_gpus < 0 means "use all visible devices". When compiled
// without NCCL support (NCCL == 0), multi-GPU is unavailable, so any
// non-zero request is forced down to a single GPU.
inline int n_devices_all(int n_gpus) {
  // The original condition mixed && and || without parentheses
  // (a -Wparentheses warning); it reduces to exactly this test:
  // NCCL disabled and some device usage was requested.
  if (NCCL == 0 && n_gpus != 0) {
    if (n_gpus != 1) {
      // Warn only when the request actually changes (n_gpus == 1 is fine).
      fprintf(stderr, "NCCL=0, so forcing n_gpus=1\n");
      fflush(stderr);
    }
    n_gpus = 1;
  }
  int n_devices_visible = dh::n_visible_devices();
  return n_gpus < 0 ? n_devices_visible : n_gpus;
}
// Devices to actually use for a dataset: never more than one device
// per data row.
inline int n_devices(int n_gpus, int num_rows) {
  int resolved = dh::n_devices_all(n_gpus);
  if (resolved > num_rows) {
    resolved = num_rows;
  }
  return resolved;
}
// if n_devices=-1, then use all visible devices
// Synchronize the first n_devices entries of the device-ordinal list.
// dList maps the dense index [0, n_devices) to a CUDA device ordinal.
// Taken by const reference (backward-compatible) to avoid copying the
// vector on every call.
inline void synchronize_n_devices(int n_devices, const std::vector<int> &dList) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaDeviceSynchronize());
  }
}
// Synchronize every CUDA device visible to this process.
inline void synchronize_all() {
  // Hoisted out of the loop condition: the original re-queried the device
  // count (a CUDA runtime call) on every iteration.
  const int device_count = n_visible_devices();
  for (int device_idx = 0; device_idx < device_count; device_idx++) {
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaDeviceSynchronize());
  }
}
inline std::string device_name(int device_idx) {
cudaDeviceProp prop;
dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
return std::string(prop.name);
}
/*
* Timers
*/
@ -119,7 +193,9 @@ struct DeviceTimer {
#ifdef DEVICE_TIMER
__device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot) // NOLINT
: GTimer(GTimer), start(clock()), slot(slot) {}
: GTimer(GTimer),
start(clock()),
slot(slot) {}
#else
__device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot) {} // NOLINT
#endif
@ -146,8 +222,8 @@ struct Timer {
void reset() { start = ClockT::now(); }
int64_t elapsed() const { return (ClockT::now() - start).count(); }
void printElapsed(std::string label) {
safe_cuda(cudaDeviceSynchronize());
printf("%s:\t %lld\n", label.c_str(), (long long)elapsed());
// synchronize_n_devices(n_devices, dList);
printf("%s:\t %lld\n", label.c_str(), elapsed());
reset();
}
};
@ -229,43 +305,47 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
* Memory
*/
enum memory_type { DEVICE, DEVICE_MANAGED };
template <memory_type MemoryT>
class bulk_allocator;
template <typename T> class dvec2;
template <typename T>
class dvec {
friend bulk_allocator;
friend class dvec2<T>;
friend class dvec2<T>;
private:
T *_ptr;
size_t _size;
int _device_idx;
void external_allocate(void *ptr, size_t size) {
public:
void external_allocate(int device_idx, void *ptr, size_t size) {
if (!empty()) {
throw std::runtime_error("Tried to allocate dvec but already allocated");
}
_ptr = static_cast<T *>(ptr);
_size = size;
_device_idx = device_idx;
}
public:
dvec() : _ptr(NULL), _size(0) {}
dvec() : _ptr(NULL), _size(0), _device_idx(0) {}
size_t size() const { return _size; }
int device_idx() const { return _device_idx; }
bool empty() const { return _ptr == NULL || _size == 0; }
T *data() { return _ptr; }
std::vector<T> as_vector() const {
std::vector<T> h_vector(size());
safe_cuda(cudaSetDevice(_device_idx));
safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
cudaMemcpyDeviceToHost));
return h_vector;
}
void fill(T value) {
safe_cuda(cudaSetDevice(_device_idx));
thrust::fill_n(thrust::device_pointer_cast(_ptr), size(), value);
}
@ -285,11 +365,7 @@ class dvec {
template <typename T2>
dvec &operator=(const std::vector<T2> &other) {
if (other.size() != size()) {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
}
thrust::copy(other.begin(), other.end(), this->tbegin());
this->copy(other.begin(), other.end());
return *this;
}
@ -298,9 +374,25 @@ class dvec {
throw std::runtime_error(
"Cannot copy assign dvec to dvec, sizes are different");
}
thrust::copy(other.tbegin(), other.tend(), this->tbegin());
safe_cuda(cudaSetDevice(this->device_idx()));
if (other.device_idx() == this->device_idx()) {
thrust::copy(other.tbegin(), other.tend(), this->tbegin());
} else {
throw std::runtime_error("Cannot copy to/from different devices");
}
return *this;
}
// Copy the range [begin, end) into this dvec's device buffer.
// Switches the current CUDA device to this dvec's device first; throws
// if the source length does not match the allocated size.
template <typename IterT>
void copy(IterT begin, IterT end) {
safe_cuda(cudaSetDevice(this->device_idx()));
if (end - begin != size()) {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
}
thrust::copy(begin, end, this->tbegin());
}
};
/**
@ -309,34 +401,34 @@ class dvec {
*/
template <typename T>
class dvec2 {
friend bulk_allocator;
private:
dvec<T> _d1, _d2;
cub::DoubleBuffer<T> _buff;
int _device_idx;
void external_allocate(void *ptr1, void *ptr2, size_t size) {
public:
void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
if (!empty()) {
throw std::runtime_error("Tried to allocate dvec2 but already allocated");
}
_d1.external_allocate(ptr1, size);
_d2.external_allocate(ptr2, size);
_d1.external_allocate(_device_idx, ptr1, size);
_d2.external_allocate(_device_idx, ptr2, size);
_buff.d_buffers[0] = static_cast<T *>(ptr1);
_buff.d_buffers[1] = static_cast<T *>(ptr2);
_buff.selector = 0;
_device_idx = device_idx;
}
public:
dvec2() : _d1(), _d2(), _buff() {}
dvec2() : _d1(), _d2(), _buff(), _device_idx(0) {}
size_t size() const { return _d1.size(); }
int device_idx() const { return _device_idx; }
bool empty() const { return _d1.empty() || _d2.empty(); }
cub::DoubleBuffer<T> &buff() { return _buff; }
dvec<T> &d1() { return _d1; }
dvec<T> &d2() { return _d2; }
T *current() { return _buff.Current(); }
@ -346,9 +438,11 @@ class dvec2 {
T *other() { return _buff.Alternate(); }
};
template <memory_type MemoryT>
class bulk_allocator {
char *d_ptr;
size_t _size;
std::vector<char *> d_ptr;
std::vector<size_t> _size;
std::vector<int> _device_idx;
const int align = 256;
@ -369,18 +463,32 @@ class bulk_allocator {
}
template <typename T, typename SizeT>
void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size) {
first_vec->external_allocate(static_cast<void *>(ptr), first_size);
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
SizeT first_size) {
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
first_size);
}
template <typename T, typename SizeT, typename... Args>
void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
Args... args) {
allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
SizeT first_size, Args... args) {
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
first_size);
ptr += align_round_up(first_size * sizeof(T));
allocate_dvec(ptr, args...);
allocate_dvec(device_idx, ptr, args...);
}
// Allocate `bytes` of raw memory for device_idx.
// memory_type::DEVICE: cudaMalloc on the selected device.
// Otherwise (DEVICE_MANAGED): cudaMallocManaged, accessible from any device.
// (Dead commented-out template line removed.)
char *allocate_device(int device_idx, size_t bytes, memory_type t) {
  char *ptr;
  if (t == memory_type::DEVICE) {
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaMalloc(&ptr, bytes));
  } else {
    safe_cuda(cudaMallocManaged(&ptr, bytes));
  }
  return ptr;
}
template <typename T, typename SizeT>
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
return 2 * align_round_up(first_size * sizeof(T));
@ -392,40 +500,46 @@ class bulk_allocator {
}
template <typename T, typename SizeT>
void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size) {
first_vec->external_allocate
(static_cast<void *>(ptr),
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size) {
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
first_size);
}
template <typename T, typename SizeT, typename... Args>
void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size,
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size,
Args... args) {
allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
allocate_dvec<T,SizeT>(device_idx, ptr, first_vec, first_size);
ptr += (align_round_up(first_size * sizeof(T)) * 2);
allocate_dvec(ptr, args...);
allocate_dvec(device_idx, ptr, args...);
}
public:
bulk_allocator() : _size(0), d_ptr(NULL) {}
~bulk_allocator() {
if (!(d_ptr == nullptr)) {
safe_cuda(cudaFree(d_ptr));
for (int i = 0; i < d_ptr.size(); i++) {
if (!(d_ptr[i] == nullptr)) {
safe_cuda(cudaSetDevice(_device_idx[i]));
safe_cuda(cudaFree(d_ptr[i]));
}
}
}
size_t size() { return _size; }
// returns sum of bytes for all allocations
size_t size() {
return std::accumulate(_size.begin(), _size.end(), static_cast<size_t>(0));
}
template <typename... Args>
void allocate(Args... args) {
if (d_ptr != NULL) {
throw std::runtime_error("Bulk allocator already allocated");
}
_size = get_size_bytes(args...);
safe_cuda(cudaMalloc(&d_ptr, _size));
allocate_dvec(d_ptr, args...);
void allocate(int device_idx, Args... args) {
size_t size = get_size_bytes(args...);
char *ptr = allocate_device(device_idx, size, MemoryT);
allocate_dvec(device_idx, ptr, args...);
d_ptr.push_back(ptr);
_size.push_back(size);
_device_idx.push_back(device_idx);
}
};
@ -455,19 +569,14 @@ struct CubMemory {
bool IsAllocated() { return d_temp_storage != NULL; }
};
inline size_t available_memory() {
inline size_t available_memory(int device_idx) {
size_t device_free = 0;
size_t device_total = 0;
safe_cuda(cudaMemGetInfo(&device_free, &device_total));
safe_cuda(cudaSetDevice(device_idx));
dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
return device_free;
}
inline std::string device_name() {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, 0));
return std::string(prop.name);
}
/*
* Utility functions
*/
@ -481,7 +590,7 @@ void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
std::cout << "\n";
}
template <typename T>
template <typename T, memory_type MemoryT>
void print(const dvec<T> &v, size_t max_items = 10) {
std::vector<T> h = v.as_vector();
for (int i = 0; i < std::min(max_items, h.size()); i++) {
@ -530,17 +639,46 @@ size_t size_bytes(const thrust::device_vector<T> &v) {
*/
template <typename L>
__global__ void launch_n_kernel(size_t n, L lambda) {
for (auto i : grid_stride_range(static_cast<size_t>(0), n)) {
__global__ void launch_n_kernel(size_t begin, size_t end, L lambda) {
for (auto i : grid_stride_range(begin, end)) {
lambda(i);
}
}
// Kernel overload that iterates [begin, end) via grid_stride_range and
// additionally forwards the device ordinal to the lambda (used by the
// multi-GPU launch path).
template <typename L>
__global__ void launch_n_kernel(int device_idx, size_t begin, size_t end,
L lambda) {
for (auto i : grid_stride_range(begin, end)) {
lambda(i, device_idx);
}
}
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void launch_n(size_t n, L lambda) {
inline void launch_n(int device_idx, size_t n, L lambda) {
safe_cuda(cudaSetDevice(device_idx));
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
#if defined(__CUDACC__)
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
lambda);
#endif
}
// Launch `lambda` over the index range [0, n), split across n_devices GPUs.
// If n_devices = -1, all visible devices are used.
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void multi_launch_n(size_t n, int n_devices, L lambda) {
  n_devices = n_devices < 0 ? n_visible_devices() : n_devices;
  CHECK_LE(n_devices, n_visible_devices()) << "Number of devices requested "
                                              "needs to be less than equal to "
                                              "number of visible devices.";
#if defined(__CUDACC__)
  if (n == 0) {
    return;  // nothing to do; also avoids a zero-sized kernel launch
  }
  // Never use more devices than there are elements.
  if (static_cast<size_t>(n_devices) > n) {
    n_devices = static_cast<int>(n);
  }
  // Ceil-divide so the per-device chunks cover all of [0, n). The previous
  // floor division (n / n_devices) dropped the trailing n % n_devices
  // elements whenever n was not a multiple of the device count.
  const size_t chunk = div_round_up(n, n_devices);
  // Size the grid for one chunk, not for all n elements.
  const int GRID_SIZE = div_round_up(chunk, ITEMS_PER_THREAD * BLOCK_THREADS);
  for (int device_idx = 0; device_idx < n_devices; device_idx++) {
    safe_cuda(cudaSetDevice(device_idx));
    size_t begin = chunk * device_idx;
    size_t end = std::min(begin + chunk, n);
    launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(device_idx, begin, end,
                                                  lambda);
  }
#endif
}

View File

@ -168,7 +168,7 @@ void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
node_id_t nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
switch(algo) {
case ABK_GMEM:

View File

@ -208,7 +208,7 @@ private:
dh::dvec<gpu_gpair> tmpScanGradBuff;
dh::dvec<int> tmpScanKeyBuff;
dh::dvec<int> colIds;
dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
void findSplit(int level, node_id_t nodeStart, int nNodes) {
reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
@ -226,7 +226,8 @@ private:
void allocateAllData(int offsetSize) {
int tmpBuffSize = scanTempBufferSize(nVals);
ba.allocate(&vals, nVals,
ba.allocate(param.gpu_id,
&vals, nVals,
&vals_cached, nVals,
&instIds, nVals,
&instIds_cached, nVals,
@ -245,7 +246,7 @@ private:
}
void setupOneTimeData(DMatrix& hMat) {
size_t free_memory = dh::available_memory();
size_t free_memory = dh::available_memory(param.gpu_id);
if (!hMat.SingleColBlock()) {
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
}
@ -258,7 +259,7 @@ private:
if (!param.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
}
}
@ -340,7 +341,7 @@ private:
colOffsets.data(), vals.current(),
nVals, nCols);
// gather the node assignments across all other columns too
gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
instIds.current(), nVals);
sortKeys(level);
}
@ -351,7 +352,7 @@ private:
// but we don't need more than level+1 bits for sorting!
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
0, level+1);
gather<float,int>(vals.other(), vals.current(), instIds.other(),
gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
instIds.current(), nodeLocations.current(), nVals);
vals.buff().selector ^= 1;
instIds.buff().selector ^= 1;

View File

@ -2,14 +2,10 @@
* Copyright 2016 Rory mitchell
*/
#pragma once
#include "types.cuh"
#include "../../../src/tree/param.h"
#include "../../../src/common/random.h"
#include "../../../src/tree/param.h"
#include "types.cuh"
namespace xgboost {
namespace tree {
} // namespace tree
namespace tree {} // namespace tree
} // namespace xgboost

View File

@ -21,7 +21,8 @@ struct GPUData {
int n_features;
int n_instances;
dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
// dh::bulk_allocator<int> ba;
GPUTrainingParam param;
dh::dvec<float> fvalues;
@ -72,24 +73,25 @@ struct GPUData {
n_features, foffsets.data(), foffsets.data() + 1);
// Allocate memory
size_t free_memory = dh::available_memory();
ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
&fvalues_cached, in_fvalues.size(), &foffsets,
in_foffsets.size(), &instance_id, in_instance_id.size(),
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
in_instance_id.size(), &feature_id, in_feature_id.size(),
&node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
&node_id_instance, n_instances, &gpair, n_instances, &nodes,
max_nodes, &split_candidates, max_nodes_level * n_features,
&node_sums, max_nodes_level * n_features, &node_offsets,
max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
&sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size,
&feature_flags, n_features, &feature_set, n_features);
size_t free_memory = dh::available_memory(param_in.gpu_id);
ba.allocate(param_in.gpu_id,
&fvalues, in_fvalues.size(), &fvalues_temp,
in_fvalues.size(), &fvalues_cached, in_fvalues.size(), &foffsets,
in_foffsets.size(), &instance_id, in_instance_id.size(),
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id,
in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance,
n_instances, &gpair, n_instances, &nodes, max_nodes, &split_candidates,
max_nodes_level * n_features, &node_sums, max_nodes_level * n_features,
&node_offsets, max_nodes_level * n_features, &sort_index_in,
in_fvalues.size(), &sort_index_out, in_fvalues.size(), &cub_mem,
cub_mem_size, &feature_flags, n_features, &feature_set, n_features);
if (!param_in.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on "
<< dh::device_name(param_in.gpu_id);
}
fvalues_cached = in_fvalues;
@ -134,9 +136,10 @@ struct GPUData {
auto d_node_id_instance = node_id_instance.data();
auto d_instance_id = instance_id.data();
dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
d_node_id[i] = d_node_id_instance[d_instance_id[i]];
});
dh::launch_n(node_id.device_idx(), fvalues.size(),
[=] __device__(bst_uint i) {
d_node_id[i] = d_node_id_instance[d_instance_id[i]];
});
}
};
} // namespace tree

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>
@ -11,6 +11,14 @@
#include "device_helpers.cuh"
#include "types.cuh"
#ifndef NCCL
#define NCCL 1
#endif
#if (NCCL)
#include "nccl.h"
#endif
namespace xgboost {
namespace tree {
@ -18,7 +26,8 @@ namespace tree {
struct DeviceGMat {
dh::dvec<int> gidx;
dh::dvec<int> ridx;
void Init(const common::GHistIndexMatrix &gmat);
void Init(int device_idx, const common::GHistIndexMatrix &gmat,
bst_uint begin, bst_uint end);
};
struct HistBuilder {
@ -31,11 +40,11 @@ struct HistBuilder {
struct DeviceHist {
int n_bins;
dh::dvec<gpu_gpair> hist;
dh::dvec<gpu_gpair> data;
void Init(int max_depth);
void Reset();
void Reset(int device_idx);
HistBuilder GetBuilder();
@ -64,7 +73,9 @@ class GPUHistBuilder {
void FindSplit(int depth);
template <int BLOCK_THREADS>
void FindSplitSpecialize(int depth);
void InitFirstNode();
template <int BLOCK_THREADS>
void LaunchFindSplit(int depth);
void InitFirstNode(const std::vector<bst_gpair> &gpair);
void UpdatePosition(int depth);
void UpdatePositionDense(int depth);
void UpdatePositionSparse(int depth);
@ -80,32 +91,48 @@ class GPUHistBuilder {
MetaInfo *info;
bool initialised;
bool is_dense;
DeviceGMat device_matrix;
const DMatrix *p_last_fmat_;
dh::bulk_allocator ba;
dh::CubMemory cub_mem;
dh::dvec<int> gidx_feature_map;
dh::dvec<int> hist_node_segments;
dh::dvec<int> feature_segments;
dh::dvec<float> gain;
dh::dvec<NodeIdT> position;
dh::dvec<NodeIdT> position_tmp;
dh::dvec<float> gidx_fvalue_map;
dh::dvec<float> fidx_min_map;
DeviceHist hist;
dh::dvec<cub::KeyValuePair<int, float>> argmax;
dh::dvec<gpu_gpair> node_sums;
dh::dvec<gpu_gpair> hist_scan;
dh::dvec<gpu_gpair> device_gpair;
dh::dvec<Node> nodes;
dh::dvec<int> feature_flags;
dh::dvec<bool> left_child_smallest;
dh::dvec<bst_float> prediction_cache;
bool prediction_cache_initialised;
// choose which memory type to use (DEVICE or DEVICE_MANAGED)
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
// dh::bulk_allocator<dh::memory_type::DEVICE_MANAGED> ba; // can't be used
// with NCCL
dh::CubMemory cub_mem;
std::vector<int> feature_set_tree;
std::vector<int> feature_set_level;
bst_uint num_rows;
int n_devices;
// below vectors are for each devices used
std::vector<int> dList;
std::vector<int> device_row_segments;
std::vector<int> device_element_segments;
std::vector<DeviceHist> hist_vec;
std::vector<dh::dvec<Node>> nodes;
std::vector<dh::dvec<Node>> nodes_temp;
std::vector<dh::dvec<Node>> nodes_child_temp;
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
std::vector<dh::dvec<float>> fidx_min_map;
std::vector<dh::dvec<int>> feature_segments;
std::vector<dh::dvec<bst_float>> prediction_cache;
std::vector<dh::dvec<NodeIdT>> position;
std::vector<dh::dvec<NodeIdT>> position_tmp;
std::vector<DeviceGMat> device_matrix;
std::vector<dh::dvec<gpu_gpair>> device_gpair;
std::vector<dh::dvec<int>> gidx_feature_map;
std::vector<dh::dvec<float>> gidx_fvalue_map;
std::vector<cudaStream_t *> streams;
#if (NCCL)
std::vector<ncclComm_t> comms;
std::vector<std::vector<ncclComm_t>> find_split_comms;
#endif
};
} // namespace tree
} // namespace xgboost

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#include <xgboost/tree_updater.h>
#include <vector>
@ -76,7 +76,7 @@ class GPUHistMaker : public TreeUpdater {
}
bool UpdatePredictionCache(const DMatrix* data,
std::vector<bst_float>* out_preds) override {
std::vector<bst_float>* out_preds) override {
return builder.UpdatePredictionCache(data, out_preds);
}

View File

@ -1,3 +1,4 @@
from __future__ import print_function
#pylint: skip-file
import sys
sys.path.append("../../tests/python")
@ -12,6 +13,10 @@ dpath = '../../demo/data/'
ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
print(*args, file=sys.stdout, **kwargs)
class TestGPU(unittest.TestCase):
def test_grow_gpu(self):
@ -58,7 +63,7 @@ class TestGPU(unittest.TestCase):
'max_depth': 3,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')],
xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert self.non_decreasing(res['test']['auc'])
@ -74,13 +79,13 @@ class TestGPU(unittest.TestCase):
'max_depth': 2,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=10, replace=False):
for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 2
dtrain3 = xgb.DMatrix(X2, label=y2)
@ -92,17 +97,18 @@ class TestGPU(unittest.TestCase):
assert res['train']['auc'][0] >= 0.85
for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=10, replace=False):
for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 3
dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
def test_grow_gpu_hist(self):
n_gpus=-1
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
try:
@ -110,122 +116,180 @@ class TestGPU(unittest.TestCase):
except:
from sklearn.cross_validation import train_test_split
# regression test --- hist must be same as exact on all-categorial data
ag_param = {'max_depth': 2,
'tree_method': 'exact',
'nthread': 1,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': 2,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_res = {}
ag_res2 = {}
for max_depth in range(3,10): # TODO: Doesn't work with 2 for some tests
#eprint("max_depth=%d" % (max_depth))
num_rounds = 10
xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res)
xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res2)
assert ag_res['train']['auc'] == ag_res2['train']['auc']
assert ag_res['test']['auc'] == ag_res2['test']['auc']
for max_bin_i in range(3,11):
max_bin = np.power(2,max_bin_i)
#eprint("max_bin=%d" % (max_bin))
digits = load_digits(2)
X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert self.non_decreasing(res['test']['auc'])
# fail-safe test for dense data
from sklearn.datasets import load_svmlight_file
X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train')
X2 = X2.toarray()
dtrain2 = xgb.DMatrix(X2, label=y2)
# regression test --- hist must be same as exact on all-categorial data
ag_param = {'max_depth': max_depth,
'tree_method': 'exact',
'nthread': 1,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': max_depth,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': 1,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_param3 = {'max_depth': max_depth,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': n_gpus,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_res = {}
ag_res2 = {}
ag_res3 = {}
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 2,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
num_rounds = 10
#eprint("normal updater");
xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res)
#eprint("grow_gpu_hist updater 1 gpu");
xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res2)
#eprint("grow_gpu_hist updater %d gpus" % (n_gpus));
xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res3)
# assert 1==0
assert ag_res['train']['auc'] == ag_res2['train']['auc']
assert ag_res['test']['auc'] == ag_res2['test']['auc']
assert ag_res['test']['auc'] == ag_res3['test']['auc']
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
######################################################################
digits = load_digits(2)
X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=10, replace=False):
X2[i, j] = 2
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': 1,
'max_bin': max_bin,
'eval_metric': 'auc'}
res = {}
#eprint("digits: grow_gpu_hist updater 1 gpu");
xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
#assert self.non_decreasing(res['test']['auc'])
param2 = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
'eval_metric': 'auc'}
res2 = {}
#eprint("digits: grow_gpu_hist updater %d gpus" % (n_gpus));
xgb.train(param2, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res2)
assert self.non_decreasing(res2['train']['auc'])
#assert self.non_decreasing(res2['test']['auc'])
assert res['train']['auc'] == res2['train']['auc']
#assert res['test']['auc'] == res2['test']['auc']
dtrain3 = xgb.DMatrix(X2, label=y2)
res = {}
######################################################################
# fail-safe test for dense data
from sklearn.datasets import load_svmlight_file
X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train')
X2 = X2.toarray()
dtrain2 = xgb.DMatrix(X2, label=y2)
xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res)
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
assert self.non_decreasing(res['train']['auc'])
if max_bin>32:
assert res['train']['auc'][0] >= 0.85
for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=10, replace=False):
X2[i, j] = 3
for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 2
dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
dtrain3 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin>32:
assert res['train']['auc'][0] >= 0.85
for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 3
dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin>32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# fail-safe test for max_bin
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'max_bin': max_bin}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin>32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# subsampling
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'subsample': 0.5,
'max_bin': max_bin}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin>32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# fail-safe test for max_bin=2
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 2,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'max_bin': 2}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
if max_bin>32:
assert res['train']['auc'][0] >= 0.85
# subsampling
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc',
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'subsample': 0.5
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
# max_bin = 2048
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc',
'max_bin': 2048
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
def non_decreasing(self, L):
    """Check that a sequence of metric values never drops noticeably.

    Returns True when every step from one element to the next either
    increases, stays flat, or decreases by strictly less than 0.001
    (a small tolerance for floating-point jitter in eval metrics).
    An empty or single-element sequence trivially passes.
    """
    for prev, cur in zip(L, L[1:]):
        # A drop of 0.001 or more counts as a real decrease.
        if prev - cur >= 0.001:
            return False
    return True

View File

@ -81,6 +81,8 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
std::vector<int> monotone_constraints;
// gpu to use for single gpu algorithms
int gpu_id;
// number of GPUs to use
int n_gpus;
// declare the parameters
DMLC_DECLARE_PARAMETER(TrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
@ -192,6 +194,10 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
.set_lower_bound(0)
.set_default(0)
.describe("gpu to use for single gpu algorithms");
DMLC_DECLARE_FIELD(n_gpus)
.set_lower_bound(-1)
.set_default(-1)
.describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs");
// add alias of parameters
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);