[GPU-Plugin] Multi-GPU for grow_gpu_hist histogram method using NVIDIA NCCL. (#2395)
parent e24f25e0c6
commit 41efe32aa5

3	.gitmodules (vendored)
@@ -4,6 +4,9 @@
 [submodule "rabit"]
 	path = rabit
 	url = https://github.com/dmlc/rabit
+[submodule "nccl"]
+	path = nccl
+	url = https://github.com/dmlc/nccl
 [submodule "cub"]
 	path = cub
 	url = https://github.com/NVlabs/cub
@@ -94,40 +94,58 @@ if(MSVC)
else()
  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
  # Prevent shared library being called liblibxgboost.so on Linux
  set(CMAKE_SHARED_LIBRARY_PREFIX "")
endif()

set(LINK_LIBRARIES dmlccore rabit)

if(PLUGIN_UPDATER_GPU)
  # nccl
  set(LINK_LIBRARIES ${LINK_LIBRARIES} nccl)
  add_subdirectory(nccl)
  set(NCCL_DIRECTORY ${PROJECT_SOURCE_DIR}/nccl)
  include_directories(${NCCL_DIRECTORY}/src)
  set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})

  # Find cub
  set(CUB_DIRECTORY "cub/" CACHE PATH "CUB 1.5.4 directory")
  set(CUB_DIRECTORY ${PROJECT_SOURCE_DIR}/cub/)
  include_directories(${CUB_DIRECTORY})

  # Find googletest
  set(GTEST_DIRECTORY "${CACHE_PREFIX}" CACHE PATH "Googletest directory")
  include_directories(${GTEST_DIRECTORY}/include)
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_60;-lineinfo;")
  if(NOT MSVC)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC")
  endif()

  # plugin
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lineinfo;--expt-extended-lambda")
  set(CUDA_SOURCES
    plugin/updater_gpu/src/updater_gpu.cu
    plugin/updater_gpu/src/gpu_hist_builder.cu
  )
  cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
  set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})
  include(${PROJECT_SOURCE_DIR}/cmake/Utils.cmake)
  include(${PROJECT_SOURCE_DIR}/cmake/Cuda.cmake)
  # use below for forcing specific arch
  #cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS} -arch=compute_52)
  # use below for auto-detect, but gpu_grow currently doesn't work with 61
  xgboost_cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
  if(MSVC)
  else()
    cuda_add_library(updater_gpu STATIC ${CUDA_SOURCES})
    set(LINK_LIBRARIES ${LINK_LIBRARIES} updater_gpu)
  endif()
else()
  set(CUDA_OBJS "")
  set(updater_gpu "")
endif()

add_library(objxgboost OBJECT ${SOURCES})
set_target_properties(${objxgboost} PROPERTIES POSITION_INDEPENDENT_CODE 1)

add_executable(runxgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost)
target_link_libraries(runxgboost ${LINK_LIBRARIES})
add_library(libxgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
add_executable(xgboost $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})

add_library(xgboost SHARED $<TARGET_OBJECTS:objxgboost> ${CUDA_OBJS})
target_link_libraries(xgboost ${LINK_LIBRARIES})
target_link_libraries(libxgboost ${LINK_LIBRARIES})

option(JVM_BINDINGS "Build JVM bindings" OFF)

@@ -136,11 +154,11 @@ if(JVM_BINDINGS)

include_directories(${JNI_INCLUDE_DIRS} jvm-packages/xgboost4j/src/native)

add_library(xgboost4j SHARED
add_library(libxgboost4j SHARED
  $<TARGET_OBJECTS:objxgboost>
  ${CUDA_OBJS}
  jvm-packages/xgboost4j/src/native/xgboost4j.cpp)
target_link_libraries(xgboost4j
target_link_libraries(libxgboost4j
  ${LINK_LIBRARIES}
  ${JNI_LIBRARIES})
endif()
289	cmake/Cuda.cmake (new file)
@@ -0,0 +1,289 @@
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)

################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
#   xgboost_detect_installed_gpus(out_variable)
function(xgboost_detect_installed_gpus out_variable)
  set(CUDA_gpu_detect_output "")
  if(NOT CUDA_gpu_detect_output)
    set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)

    file(WRITE ${__cufile} ""
      "#include <cstdio>\n"
      "int main()\n"
      "{\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device)\n"
      "  {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")
    if(MSVC)
      # find vcvarsall.bat and run it, building the MSVC environment
      get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
      find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..")
      execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      ERROR_QUIET
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
    else()
      if(CUDA_LIBRARY_PATH)
        set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}")
      endif()
      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH}
                      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                      RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                      ERROR_QUIET
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
    if(__nvcc_res EQUAL 0)
      # nvcc outputs text containing line breaks when building with MSVC.
      # The line below prevents CMake from inserting a variable with line
      # breaks in the cache
      string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}")
      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architectures from xgboost_detect_installed_gpus tool" FORCE)
    else()
      message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}")
    endif()
  endif()

  if(NOT CUDA_gpu_detect_output)
    message(WARNING "Automatic GPU detection failed. Building for all known architectures (${xgboost_known_gpu_archs}).")
    set(${out_variable} ${xgboost_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()
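For reference, the probe that `file(WRITE ...)` emits above can also be compiled by hand to see what the detection step produces. This standalone sketch mirrors that snippet verbatim (nvcc pulls in the CUDA runtime headers automatically for `.cu` files):

```cuda
// Standalone copy of the probe emitted by xgboost_detect_installed_gpus;
// run with: nvcc -arch sm_30 --run detect_cuda_archs.cu
#include <cstdio>

int main() {
  int count = 0;
  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
  if (count == 0) return -1;
  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
      std::printf("%d.%d ", prop.major, prop.minor);  // e.g. "6.1 5.2 "
  }
  return 0;
}
```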
################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
#   xgboost_select_nvcc_arch_flags(out_variable)
function(xgboost_select_nvcc_arch_flags out_variable)
  # List of arch names
  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual")
  set(__archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND __archs_names "Auto")
    set(__archs_name_default "Auto")
  endif()

  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down in the CMake GUI)
  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
  set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names})
  mark_as_advanced(CUDA_ARCH_NAME)

  # verify CUDA_ARCH_NAME value
  if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
    string(REPLACE ";" ", " __archs_names "${__archs_names}")
    message(FATAL_ERROR "Only ${__archs_names} architecture names are supported.")
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(CUDA_ARCH_BIN ${xgboost_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
    set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
    unset(CUDA_ARCH_PTX CACHE)
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Fermi")
    set(__cuda_arch_bin "20 21(20)")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(__cuda_arch_bin "30 35")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
    set(__cuda_arch_bin "50")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
    set(__cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(__cuda_arch_bin ${xgboost_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
    xgboost_detect_installed_gpus(__cuda_arch_bin)
  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()

  # remove dots and convert to lists
  string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}")
  string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}")
  string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}")
  string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}")
  xgboost_list_unique(__cuda_arch_bin __cuda_arch_ptx)

  set(__nvcc_flags "")
  set(__nvcc_archs_readable "")

  # Tell NVCC to add binaries for the specified GPUs
  foreach(__arch ${__cuda_arch_bin})
    if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
      # User explicitly specified PTX for the concrete BIN
      list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
      list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1})
    else()
      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
      list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch})
      list(APPEND __nvcc_archs_readable sm_${__arch})
    endif()
  endforeach()

  # Tell NVCC to add PTX intermediate code for the specified architectures
  foreach(__arch ${__cuda_arch_ptx})
    list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch})
    list(APPEND __nvcc_archs_readable compute_${__arch})
  endforeach()

  string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}")
  set(${out_variable} ${__nvcc_flags} PARENT_SCOPE)
  set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
endfunction()
################################################################################################
# Short command for cuda compilation
# Usage:
#   xgboost_cuda_compile(<objlist_variable> <cuda_files>)
macro(xgboost_cuda_compile objlist_variable)
  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var}_backup_in_cuda_compile_ "${${var}}")

    # we remove /EHa as it generates warnings under windows
    string(REPLACE "/EHa" "" ${var} "${${var}}")

  endforeach()
  if(UNIX OR APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
  endif()

  if(APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
  endif()

  set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G -lineinfo")

  if(MSVC)
    # disable noisy warnings:
    # 4819: The file contains a character that cannot be represented in the current code page (number).
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
    foreach(flag_var
        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif(${flag_var} MATCHES "/MD")
    endforeach(flag_var)
  endif()

  # If the build system is a container, make sure the nvcc intermediate files
  # go into the build output area rather than in /tmp, which may run out of space
  if(IS_CONTAINER_BUILD)
    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
  endif()

  cuda_compile(cuda_objcs ${ARGN})

  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var} "${${var}_backup_in_cuda_compile_}")
    unset(${var}_backup_in_cuda_compile_)
  endforeach()

  set(${objlist_variable} ${cuda_objcs})
endmacro()

################################################################################################
### Non macro section
################################################################################################

# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
if(NOT CUDA_TOOLKIT_ROOT_DIR)
  find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
  if(CUDA_LIBRARY_PATH)
    get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
  endif()
endif()

find_package(CUDA 5.5 QUIET REQUIRED)
find_cuda_helper_libs(curand)  # CMake 2.8.7 compatibility; it doesn't search for curand

if(NOT CUDA_FOUND)
  return()
endif()

set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND xgboost_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
                                ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

# Known NVIDIA GPU architectures xgboost can be compiled for.
# This list will be used for the CUDA_ARCH_NAME = All option
if(CUDA_ARCH_ALL)
  set(xgboost_known_gpu_archs "${CUDA_ARCH_ALL}")
else()
  if(${CUDA_VERSION} GREATER 7.5)
    set(xgboost_known_gpu_archs "30 35 50 52 60 61")
  else()
    set(xgboost_known_gpu_archs "30 35 50 52")
  endif()
endif()

# cudnn detection
if(USE_CUDNN)
  detect_cuDNN()
  if(HAVE_CUDNN)
    add_definitions(-DUSE_CUDNN)
    include_directories(SYSTEM ${CUDNN_INCLUDE})
    list(APPEND xgboost_LINKER_LIBS ${CUDNN_LIBRARY})
  endif()
endif()

# setting nvcc arch flags
xgboost_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")

# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
if(Boost_VERSION EQUAL 105500)
  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
  # avoid warning for CMake >= 2.8.12
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
endif()

# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
endforeach()

# setting default testing device
if(NOT CUDA_TEST_DEVICE)
  set(CUDA_TEST_DEVICE -1)
endif()

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

# Handle clang/libc++ issue
if(APPLE)
  xgboost_detect_darwin_version(OSX_VERSION)

  # OSX 10.9 and higher uses clang/libc++ by default, which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    xgboost_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()
398	cmake/Utils.cmake (new file)
@@ -0,0 +1,398 @@
################################################################################################
# Command alias for debugging messages
# Usage:
#   dmsg(<message>)
function(dmsg)
  message(STATUS ${ARGN})
endfunction()

################################################################################################
# Removes duplicates from list(s)
# Usage:
#   xgboost_list_unique(<list_variable> [<list_variable>] [...])
macro(xgboost_list_unique)
  foreach(__lst ${ARGN})
    if(${__lst})
      list(REMOVE_DUPLICATES ${__lst})
    endif()
  endforeach()
endmacro()

################################################################################################
# Clears variables from list
# Usage:
#   xgboost_clear_vars(<variables_list>)
macro(xgboost_clear_vars)
  foreach(_var ${ARGN})
    unset(${_var})
  endforeach()
endmacro()

################################################################################################
# Removes duplicates from string
# Usage:
#   xgboost_string_unique(<string_variable>)
function(xgboost_string_unique __string)
  if(${__string})
    set(__list ${${__string}})
    separate_arguments(__list)
    list(REMOVE_DUPLICATES __list)
    foreach(__e ${__list})
      set(__str "${__str} ${__e}")
    endforeach()
    set(${__string} ${__str} PARENT_SCOPE)
  endif()
endfunction()

################################################################################################
# Prints list element per line
# Usage:
#   xgboost_print_list(<list>)
function(xgboost_print_list)
  foreach(e ${ARGN})
    message(STATUS ${e})
  endforeach()
endfunction()

################################################################################################
# Function merging lists of compiler flags to single string.
# Usage:
#   xgboost_merge_flag_lists(out_variable <list1> [<list2>] [<list3>] ...)
function(xgboost_merge_flag_lists out_var)
  set(__result "")
  foreach(__list ${ARGN})
    foreach(__flag ${${__list}})
      string(STRIP ${__flag} __flag)
      set(__result "${__result} ${__flag}")
    endforeach()
  endforeach()
  string(STRIP ${__result} __result)
  set(${out_var} ${__result} PARENT_SCOPE)
endfunction()

################################################################################################
# Converts all paths in list to absolute
# Usage:
#   xgboost_convert_absolute_paths(<list_variable>)
function(xgboost_convert_absolute_paths variable)
  set(__dlist "")
  foreach(__s ${${variable}})
    get_filename_component(__abspath ${__s} ABSOLUTE)
    list(APPEND __list ${__abspath})
  endforeach()
  set(${variable} ${__list} PARENT_SCOPE)
endfunction()

################################################################################################
# Reads set of version defines from the header file
# Usage:
#   xgboost_parse_header(<file> <define1> <define2> <define3> ..)
macro(xgboost_parse_header FILENAME FILE_VAR)
  set(vars_regex "")
  set(__parent_scope OFF)
  set(__add_cache OFF)
  foreach(name ${ARGN})
    if("${name}" STREQUAL "PARENT_SCOPE")
      set(__parent_scope ON)
    elseif("${name}" STREQUAL "CACHE")
      set(__add_cache ON)
    elseif(vars_regex)
      set(vars_regex "${vars_regex}|${name}")
    else()
      set(vars_regex "${name}")
    endif()
  endforeach()
  if(EXISTS "${FILENAME}")
    file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+")
  else()
    unset(${FILE_VAR})
  endif()
  foreach(name ${ARGN})
    if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
      if(${FILE_VAR})
        if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
          string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
        else()
          set(${name} "")
        endif()
        if(__add_cache)
          set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
        elseif(__parent_scope)
          set(${name} "${${name}}" PARENT_SCOPE)
        endif()
      else()
        unset(${name} CACHE)
      endif()
    endif()
  endforeach()
endmacro()

################################################################################################
# Reads single version define from the header file and parses it
# Usage:
#   xgboost_parse_header_single_define(<library_name> <file> <define_name>)
function(xgboost_parse_header_single_define LIBNAME HDR_PATH VARNAME)
  set(${LIBNAME}_H "")
  if(EXISTS "${HDR_PATH}")
    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
  endif()

  if(${LIBNAME}_H)
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}")
    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
    set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE)
    set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE)

    # append a TWEAK version if it exists:
    set(${LIBNAME}_VERSION_TWEAK "")
    if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$")
      set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE)
    endif()
    if(${LIBNAME}_VERSION_TWEAK)
      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE)
    else()
      set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE)
    endif()
  endif()
endfunction()

########################################################################################################
# An option that the user can select. Can accept condition to control when option is available for user.
# Usage:
#   xgboost_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
function(xgboost_option variable description value)
  set(__value ${value})
  set(__condition "")
  set(__varname "__value")
  foreach(arg ${ARGN})
    if(arg STREQUAL "IF" OR arg STREQUAL "if")
      set(__varname "__condition")
    else()
      list(APPEND ${__varname} ${arg})
    endif()
  endforeach()
  unset(__varname)
  if("${__condition}" STREQUAL "")
    set(__condition 2 GREATER 1)
  endif()

  if(${__condition})
    if("${__value}" MATCHES ";")
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    elseif(DEFINED ${__value})
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    else()
      option(${variable} "${description}" ${__value})
    endif()
  else()
    unset(${variable} CACHE)
  endif()
endfunction()

################################################################################################
# Utility macro for comparing two lists. Used for CMake debugging purposes
# Usage:
#   xgboost_compare_lists(<list_variable> <list2_variable> [description])
function(xgboost_compare_lists list1 list2 desc)
  set(__list1 ${${list1}})
  set(__list2 ${${list2}})
  list(SORT __list1)
  list(SORT __list2)
  list(LENGTH __list1 __len1)
  list(LENGTH __list2 __len2)

  if(NOT ${__len1} EQUAL ${__len2})
    message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}")
  endif()

  foreach(__i RANGE 1 ${__len1})
    math(EXPR __index "${__i} - 1")
    list(GET __list1 ${__index} __item1)
    list(GET __list2 ${__index} __item2)
    if(NOT ${__item1} STREQUAL ${__item2})
      message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. ${desc}")
    endif()
  endforeach()
endfunction()

################################################################################################
# Command for disabling warnings for different platforms (see below for gcc and VisualStudio)
# Usage:
#   xgboost_warnings_disable(<CMAKE_[C|CXX]_FLAGS[_CONFIGURATION]> -Wshadow /wd4996 ..,)
macro(xgboost_warnings_disable)
  set(_flag_vars "")
  set(_msvc_warnings "")
  set(_gxx_warnings "")

  foreach(arg ${ARGN})
    if(arg MATCHES "^CMAKE_")
      list(APPEND _flag_vars ${arg})
    elseif(arg MATCHES "^/wd")
      list(APPEND _msvc_warnings ${arg})
    elseif(arg MATCHES "^-W")
      list(APPEND _gxx_warnings ${arg})
    endif()
  endforeach()

  if(NOT _flag_vars)
    set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  endif()

  if(MSVC AND _msvc_warnings)
    foreach(var ${_flag_vars})
      foreach(warning ${_msvc_warnings})
        set(${var} "${${var}} ${warning}")
      endforeach()
    endforeach()
  elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings)
    foreach(var ${_flag_vars})
      foreach(warning ${_gxx_warnings})
        if(NOT warning MATCHES "^-Wno-")
          string(REPLACE "${warning}" "" ${var} "${${var}}")
          string(REPLACE "-W" "-Wno-" warning "${warning}")
        endif()
        set(${var} "${${var}} ${warning}")
      endforeach()
    endforeach()
  endif()
  xgboost_clear_vars(_flag_vars _msvc_warnings _gxx_warnings)
endmacro()

################################################################################################
# Helper function get current definitions
# Usage:
#   xgboost_get_current_definitions(<definitions_variable>)
function(xgboost_get_current_definitions definitions_var)
  get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS)
  set(result "")

  foreach(d ${current_definitions})
    list(APPEND result -D${d})
  endforeach()

  xgboost_list_unique(result)
  set(${definitions_var} ${result} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function get current includes/definitions
# Usage:
#   xgboost_get_current_cflags(<cflagslist_variable>)
function(xgboost_get_current_cflags cflags_var)
  get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
  xgboost_convert_absolute_paths(current_includes)
  xgboost_get_current_definitions(cflags)

  foreach(i ${current_includes})
    list(APPEND cflags "-I${i}")
  endforeach()

  xgboost_list_unique(cflags)
  set(${cflags_var} ${cflags} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function to parse current linker libs into link directories, libflags and osx frameworks
# Usage:
#   xgboost_parse_linker_libs(<xgboost_LINKER_LIBS_var> <directories_var> <libflags_var> <frameworks_var>)
function(xgboost_parse_linker_libs xgboost_LINKER_LIBS_variable folders_var flags_var frameworks_var)

  set(__unspec "")
  set(__debug "")
  set(__optimized "")
  set(__framework "")
  set(__varname "__unspec")

  # split libs into debug, optimized, unspecified and frameworks
  foreach(list_elem ${${xgboost_LINKER_LIBS_variable}})
    if(list_elem STREQUAL "debug")
      set(__varname "__debug")
    elseif(list_elem STREQUAL "optimized")
      set(__varname "__optimized")
    elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)")
      list(APPEND __framework -framework ${CMAKE_MATCH_1})
    else()
      list(APPEND ${__varname} ${list_elem})
      set(__varname "__unspec")
    endif()
  endforeach()

  # attach debug or optimized libs to unspecified according to current configuration
  if(CMAKE_BUILD_TYPE MATCHES "Debug")
    set(__libs ${__unspec} ${__debug})
  else()
    set(__libs ${__unspec} ${__optimized})
  endif()

  set(libflags "")
  set(folders "")

  # convert linker libraries list to link flags
  foreach(lib ${__libs})
    if(TARGET ${lib})
      list(APPEND folders $<TARGET_LINKER_FILE_DIR:${lib}>)
      list(APPEND libflags -l${lib})
    elseif(lib MATCHES "^-l.*")
      list(APPEND libflags ${lib})
    elseif(IS_ABSOLUTE ${lib})
      get_filename_component(name_we ${lib} NAME_WE)
      get_filename_component(folder ${lib} PATH)

      string(REGEX MATCH "^lib(.*)" __match ${name_we})
      list(APPEND libflags -l${CMAKE_MATCH_1})
      list(APPEND folders ${folder})
    else()
      message(FATAL_ERROR "Logic error. Need to update cmake script")
    endif()
  endforeach()

  xgboost_list_unique(libflags folders)

  set(${folders_var} ${folders} PARENT_SCOPE)
  set(${flags_var} ${libflags} PARENT_SCOPE)
  set(${frameworks_var} ${__framework} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, ....
# Usage:
#   xgboost_detect_darwin_version(<version_variable>)
function(xgboost_detect_darwin_version output_var)
  if(APPLE)
    execute_process(COMMAND /usr/bin/sw_vers -productVersion
                    RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    set(${output_var} ${__sw_vers_out} PARENT_SCOPE)
  else()
    set(${output_var} "" PARENT_SCOPE)
  endif()
endfunction()

################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
#   xgboost_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
function(xgboost_source_group group)
  cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(CAFFE_SOURCE_GROUP_GLOB)
    file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${srcs1})
  endif()

  if(CAFFE_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${srcs2})
  endif()
endfunction()
jvm-packages/xgboost4j/src/native/xgboost4j.cpp

@@ -24,7 +24,11 @@
 // helper functions
 // set handle
 void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
-  jlong out = (jlong) handle;
+#ifdef __APPLE__
+  jlong out = (long) handle;
+#else
+  int64_t out = (int64_t) handle;
+#endif
   jenv->SetLongArrayRegion(jhandle, 0, 1, &out);
 }

@@ -32,7 +36,7 @@ void setHandle(JNIEnv *jenv, jlongArray jhandle, void* handle) {
 static JavaVM* global_jvm = nullptr;

 // overrides JNI on load
-JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
+jint JNI_OnLoad(JavaVM *vm, void *reserved) {
   global_jvm = vm;
   return JNI_VERSION_1_6;
 }

@@ -72,7 +76,7 @@ XGB_EXTERN_C int XGBoost4jCallbackDataIterNext(
       batch, jenv->GetFieldID(batchClass, "featureValue", "[F"));
   XGBoostBatchCSR cbatch;
   cbatch.size = jenv->GetArrayLength(joffset) - 1;
-  cbatch.offset = reinterpret_cast<jlong *>(
+  cbatch.offset = reinterpret_cast<long *>(
       jenv->GetLongArrayElements(joffset, 0));
   if (jlabel != nullptr) {
     cbatch.label = jenv->GetFloatArrayElements(jlabel, 0);
1	nccl (submodule)

@@ -0,0 +1 @@
+Subproject commit 93183bca921b2e8e1754e27e1b43d73cf6caec9d
@@ -17,8 +17,11 @@ colsample_bytree | ✔ | ✔ |
 colsample_bylevel | ✔ | ✔ |
 max_bin | ✖ | ✔ |
 gpu_id | ✔ | ✔ |
+n_gpus | ✖ | ✔ |

-All algorithms currently use only a single GPU. The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
+The device ordinal can be selected using the 'gpu_id' parameter, which defaults to 0.
+
+Multiple GPUs can be used with the grow_gpu_hist method via the 'n_gpus' parameter, which defaults to -1 (use all visible GPUs). If 'gpu_id' is non-zero, the devices used are (gpu_id + i) % n_visible_devices for i = 0 to n_gpus - 1. As with GPU vs. CPU, multi-GPU is not always faster than a single GPU, because PCI bus bandwidth can limit performance. For example, when n_features * n_bins * 2^depth divided by the time of each round/iteration becomes comparable to the real PCI 16x bus bandwidth of order 4GB/s to 10GB/s, AllReduce dominates the running time and additional GPUs stop improving performance. CPU overhead between GPU calls can also limit the usefulness of multiple GPUs.
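As an illustrative sketch (not plugin code), the device rotation described above amounts to:

```cpp
#include <vector>

// Illustrative only: the device list implied by the rule above,
// (gpu_id + i) % n_visible_devices for i = 0..n_gpus-1.
std::vector<int> device_list(int gpu_id, int n_gpus, int n_visible_devices) {
  if (n_gpus < 0) n_gpus = n_visible_devices;  // n_gpus = -1: use all GPUs
  std::vector<int> devices(n_gpus);
  for (int i = 0; i < n_gpus; ++i)
    devices[i] = (gpu_id + i) % n_visible_devices;
  return devices;
}
```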
 This plugin currently works with the CLI and Python versions.

@@ -54,29 +57,38 @@ $ python -m nose test/python/
 ## Dependencies
 A CUDA-capable GPU with compute capability 3.5 or higher (the algorithm depends on shuffle and vote instructions introduced in Kepler).

-Building the plug-in requires CUDA Toolkit 7.5 or later.
+Building the plug-in requires CUDA Toolkit 7.5 or later (https://developer.nvidia.com/cuda-downloads).

+submodule: The plugin also depends on CUB 1.6.4 - https://nvlabs.github.io/cub/ . CUB is a header-only CUDA library which provides sort/reduce/scan primitives.

+submodule: NVIDIA NCCL from https://github.com/NVIDIA/nccl, with a Windows port available via git@github.com:h2oai/nccl.git

 ## Build

 ### Using cmake
 To use the plugin, xgboost must be built by specifying the option PLUGIN_UPDATER_GPU=ON. CMake will prepare a build system depending on which platform you are on.
-From the command line on Linux starting from the xgboost directory:
+On Linux, from the xgboost directory:
```bash
$ mkdir build
$ cd build
$ cmake .. -DPLUGIN_UPDATER_GPU=ON
-$ make
+$ make -j
```
+If 'make' fails, try invoking make again; there can sometimes be problems with the order in which items are built.

-On Windows you may also need to specify your generator as 64 bit, so the cmake command becomes:
+On Windows, check which Generator options cmake offers and choose one with [arch] replaced by Win64:
```bash
-$ cmake .. -G"Visual Studio 12 2013 Win64" -DPLUGIN_UPDATER_GPU=ON
+cmake -help
```
-You may also be able to use a later version of visual studio depending on whether the CUDA toolkit supports it.
-cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode. This is also a good time to check it is being built as x64. If not make sure the cmake generator is set correctly.
+Then run cmake as:
```bash
$ mkdir build
$ cd build
$ cmake .. -G"Visual Studio 14 2015 Win64" -DPLUGIN_UPDATER_GPU=ON
```
+Cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode as an x64 build.

+Visual Studio Community 2015, supported by the CUDA toolkit (http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/#axzz4isREr2nS), can be downloaded from https://my.visualstudio.com/Downloads?q=Visual%20Studio%20Community%202015 . You may also be able to use a later version of Visual Studio depending on whether the CUDA toolkit supports it. Note that MinGW cannot be used with CUDA.

 ### For Developers!

 ### Using make
 Now, it also supports the usual 'make' flow to build gpu-enabled tree construction plugins. It's currently only tested on Linux. From the xgboost directory:
@@ -84,9 +96,6 @@ Now, it also supports the usual 'make' flow to build gpu-enabled tree constructi
```bash
# make sure CUDA SDK bin directory is in the 'PATH' env variable
$ make PLUGIN_UPDATER_GPU=ON
```

 ### For Developers!

 Now, some of the code-base inside gpu plugins has googletest unit-tests inside 'tests/'.
 They can be enabled and run along with the other unit-tests inside '<xgboostRoot>/tests/cpp' using:
```bash
@@ -98,10 +107,17 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
```

 ## Changelog
+##### 2017/6/5
+
+* Multi-GPU support for histogram method using NVIDIA NCCL.

 ##### 2017/5/31
 * Faster version of the grow_gpu plugin
 * Added support for building gpu plugin through 'make' flow too

 ##### 2017/5/19
 * Further performance enhancements for histogram method.

 ##### 2017/5/5
 * Histogram performance improvements
 * Fix gcc build issues
@@ -115,10 +131,19 @@ $ make PLUGIN_UPDATER_GPU=ON GTEST_PATH=${CACHE_PREFIX} test
 [Mitchell, Rory, and Eibe Frank. Accelerating the XGBoost algorithm using GPU computing. No. e2911v1. PeerJ Preprints, 2017.](https://peerj.com/preprints/2911/)

 ## Author
 Rory Mitchell
 Jonathan C. McKinney
 Shankara Rao Thejaswi Nanditale
 Vinay Deshpande
 ... and the rest of the H2O.ai and NVIDIA team.

+Please report bugs to the xgboost/issues page. You can tag me with @RAMitchell.
+
+Otherwise I can be contacted at r.a.mitchell.nz at gmail.
-Please report bugs to the xgboost/issues page.
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2016 Rory mitchell
+ * Copyright 2017 XGBoost contributors
 */
 #pragma once
 #include <vector>
@@ -147,7 +148,8 @@ inline void dense2sparse_tree(RegTree* p_tree,
 }

 // Set gradient pair to 0 with p = 1 - subsample
-inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
+inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample,
+                            int offset) {
   if (subsample == 1.0) {
     return;
   }
@@ -157,13 +158,19 @@ inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
   auto d_gpair = gpair.data();
   dh::BernoulliRng rng(subsample, common::GlobalRandom()());

-  dh::launch_n(gpair.size(), [=] __device__(int i) {
-    if (!rng(i)) {
+  dh::launch_n(gpair.device_idx(), gpair.size(), [=] __device__(int i) {
+    if (!rng(i + offset)) {
       d_gpair[i] = gpu_gpair();
     }
   });
 }

+// Set gradient pair to 0 with p = 1 - subsample
+inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
+  int offset = 0;
+  subsample_gpair(p_gpair, subsample, offset);
+}
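A hypothetical driver loop showing the intent of the new offset argument (the shard layout, `row_begin`, and include paths below are illustrative, not plugin code): each device owns a contiguous shard of rows, and passing the shard's global starting row means the RNG is indexed by the row's global index, so the subsampling mask matches what a single-GPU run would produce on the unsharded gradient vector.

```cpp
#include <vector>
#include "device_helpers.cuh"  // assumed header for dh::dvec
#include "types.cuh"           // assumed header for gpu_gpair

// Illustrative only: subsample every device shard with its global row offset.
void subsample_all_shards(std::vector<dh::dvec<gpu_gpair>>* shards,
                          const std::vector<int>& row_begin, float subsample) {
  for (size_t d = 0; d < shards->size(); ++d) {
    // offset = global index of the first row held by device d
    subsample_gpair(&(*shards)[d], subsample, row_begin[d]);
  }
}
```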
 inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
   int n = colsample * features.size();
   CHECK_GT(n, 0);
@@ -233,8 +240,8 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
  * @param def default value to be filled
  */
 template <typename T, int BlkDim=256, int ItemsPerThread=4>
-void fillConst(T* out, int len, T def) {
-  dh::launch_n<ItemsPerThread,BlkDim>(len, [=] __device__(int i) { out[i] = def; });
+void fillConst(int device_idx, T* out, int len, T def) {
+  dh::launch_n<ItemsPerThread,BlkDim>(device_idx, len, [=] __device__(int i) { out[i] = def; });
 }

 /**
@@ -247,10 +254,10 @@ void fillConst(T* out, int len, T def) {
  * @param nVals length of the buffers
  */
 template <typename T1, typename T2, int BlkDim=256, int ItemsPerThread=4>
-void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
+void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
             int nVals) {
   dh::launch_n<ItemsPerThread,BlkDim>
-    (nVals, [=] __device__(int i) {
+    (device_idx, nVals, [=] __device__(int i) {
       int iid = instId[i];
       T1 v1 = in1[iid];
       T2 v2 = in2[iid];
@@ -267,9 +274,9 @@ void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
  * @param nVals length of the buffers
  */
 template <typename T, int BlkDim=256, int ItemsPerThread=4>
-void gather(T* out, const T* in, const int* instId, int nVals) {
+void gather(int device_idx, T* out, const T* in, const int* instId, int nVals) {
   dh::launch_n<ItemsPerThread,BlkDim>
-    (nVals, [=] __device__(int i) {
+    (device_idx, nVals, [=] __device__(int i) {
       int iid = instId[i];
       out[i] = in[iid];
     });
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2016 Rory mitchell
+ * Copyright 2017 XGBoost contributors
 */
 #pragma once
 #include <thrust/device_vector.h>
@@ -12,11 +12,20 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <numeric>
 #include <cub/cub.cuh>

+#ifndef NCCL
+#define NCCL 1
+#endif
+
+#if (NCCL)
+#include "nccl.h"
+#endif
+
 // Uncomment to enable
 // #define DEVICE_TIMER
-// #define TIMERS
+#define TIMERS

 namespace dh {

@@ -42,6 +51,22 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
   return code;
 }

+#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)
+
+#if (NCCL)
+inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
+                                        int line) {
+  if (code != ncclSuccess) {
+    std::stringstream ss;
+    ss << "NCCL failure :" << ncclGetErrorString(code) << " ";
+    ss << file << "(" << line << ")";
+    throw std::runtime_error(ss.str());
+  }
+
+  return code;
+}
+#endif
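A sketch of the intended usage, assuming the standard NCCL API (`ncclCommInitAll`, `ncclCommDestroy`); the `using` declaration is needed because the macro expands to an unqualified call into namespace `dh`:

```cpp
#include <nccl.h>
#include "device_helpers.cuh"  // assumed location of dh::throw_on_nccl_error

using dh::throw_on_nccl_error;  // safe_nccl expands to an unqualified call

int main() {
  // Each NCCL call is wrapped so a failure surfaces as a std::runtime_error
  // carrying ncclGetErrorString(code) plus file/line context.
  ncclComm_t comms[2];
  int devs[2] = {0, 1};                        // hypothetical two-GPU box
  safe_nccl(ncclCommInitAll(comms, 2, devs));  // one communicator per GPU
  for (int i = 0; i < 2; ++i) safe_nccl(ncclCommDestroy(comms[i]));
  return 0;
}
```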
 #define gpuErrchk(ans) \
   { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line,
@@ -53,6 +78,55 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
   }
 }

+inline int n_visible_devices() {
+  int n_visgpus = 0;
+
+  cudaGetDeviceCount(&n_visgpus);
+
+  return n_visgpus;
+}
+
+inline int n_devices_all(int n_gpus) {
+  if ((NCCL == 0 && n_gpus > 1) || (NCCL == 0 && n_gpus != 0)) {
+    if (n_gpus != 1 && n_gpus != 0) {
+      fprintf(stderr, "NCCL=0, so forcing n_gpus=1\n");
+      fflush(stderr);
+    }
+    n_gpus = 1;
+  }
+  int n_devices_visible = dh::n_visible_devices();
+  int n_devices = n_gpus < 0 ? n_devices_visible : n_gpus;
+  return (n_devices);
+}
+inline int n_devices(int n_gpus, int num_rows) {
+  int n_devices = dh::n_devices_all(n_gpus);
+  // fix up the device count so it is limited by the number of rows
+  n_devices = n_devices > num_rows ? num_rows : n_devices;
+  return (n_devices);
+}
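Illustrative only: with a hypothetical machine exposing 4 GPUs, these helpers resolve the user-facing n_gpus parameter as follows.

```cpp
#include "device_helpers.cuh"  // assumed header providing dh::n_devices

int main() {
  // With 4 visible GPUs (hypothetical), dh::n_devices resolves n_gpus so:
  //   dh::n_devices(-1, 1000000) == 4  // n_gpus = -1 means all visible GPUs
  //   dh::n_devices( 2, 1000000) == 2  // explicit request honoured
  //   dh::n_devices(-1, 3)       == 3  // never more devices than rows
  return dh::n_devices(-1, 1000000) > 0 ? 0 : 1;
}
```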
+// if n_devices=-1, then use all visible devices
+inline void synchronize_n_devices(int n_devices, std::vector<int> dList) {
+  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
+    int device_idx = dList[d_idx];
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaDeviceSynchronize());
+  }
+}
+inline void synchronize_all() {
+  for (int device_idx = 0; device_idx < n_visible_devices(); device_idx++) {
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaDeviceSynchronize());
+  }
+}
+
+inline std::string device_name(int device_idx) {
+  cudaDeviceProp prop;
+  dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
+  return std::string(prop.name);
+}
 /*
  * Timers
  */
@@ -119,7 +193,9 @@ struct DeviceTimer {

 #ifdef DEVICE_TIMER
   __device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot)  // NOLINT
-      : GTimer(GTimer), start(clock()), slot(slot) {}
+      : GTimer(GTimer),
+        start(clock()),
+        slot(slot) {}
 #else
   __device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot) {}  // NOLINT
 #endif
@@ -146,8 +222,8 @@ struct Timer {
   void reset() { start = ClockT::now(); }
   int64_t elapsed() const { return (ClockT::now() - start).count(); }
   void printElapsed(std::string label) {
-    safe_cuda(cudaDeviceSynchronize());
-    printf("%s:\t %lld\n", label.c_str(), (long long)elapsed());
+    // synchronize_n_devices(n_devices, dList);
+    printf("%s:\t %lld\n", label.c_str(), elapsed());
     reset();
   }
 };
@@ -229,43 +305,47 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
  * Memory
  */

+enum memory_type { DEVICE, DEVICE_MANAGED };
+
+template <memory_type MemoryT>
 class bulk_allocator;
+template <typename T> class dvec2;

 template <typename T>
 class dvec {
   friend bulk_allocator;
+  friend class dvec2<T>;

 private:
   T *_ptr;
   size_t _size;
+  int _device_idx;

-  void external_allocate(void *ptr, size_t size) {
+ public:
+  void external_allocate(int device_idx, void *ptr, size_t size) {
     if (!empty()) {
       throw std::runtime_error("Tried to allocate dvec but already allocated");
     }
     _ptr = static_cast<T *>(ptr);
     _size = size;
+    _device_idx = device_idx;
   }

- public:
-  dvec() : _ptr(NULL), _size(0) {}
+  dvec() : _ptr(NULL), _size(0), _device_idx(0) {}
   size_t size() const { return _size; }
+  int device_idx() const { return _device_idx; }
   bool empty() const { return _ptr == NULL || _size == 0; }

   T *data() { return _ptr; }

   std::vector<T> as_vector() const {
     std::vector<T> h_vector(size());
+    safe_cuda(cudaSetDevice(_device_idx));
     safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
     return h_vector;
   }

   void fill(T value) {
+    safe_cuda(cudaSetDevice(_device_idx));
     thrust::fill_n(thrust::device_pointer_cast(_ptr), size(), value);
   }

@@ -285,11 +365,7 @@ class dvec {

   template <typename T2>
   dvec &operator=(const std::vector<T2> &other) {
-    if (other.size() != size()) {
-      throw std::runtime_error(
-          "Cannot copy assign vector to dvec, sizes are different");
-    }
-    thrust::copy(other.begin(), other.end(), this->tbegin());
+    this->copy(other.begin(), other.end());
     return *this;
   }

@@ -298,9 +374,25 @@
       throw std::runtime_error(
           "Cannot copy assign dvec to dvec, sizes are different");
     }
+    safe_cuda(cudaSetDevice(this->device_idx()));
+    if (other.device_idx() == this->device_idx()) {
       thrust::copy(other.tbegin(), other.tend(), this->tbegin());
+    } else {
+      throw std::runtime_error("Cannot copy to/from different devices");
+    }
+
     return *this;
   }
+
+  template <typename IterT>
+  void copy(IterT begin, IterT end) {
+    safe_cuda(cudaSetDevice(this->device_idx()));
+    if (end - begin != size()) {
+      throw std::runtime_error(
+          "Cannot copy assign vector to dvec, sizes are different");
+    }
+    thrust::copy(begin, end, this->tbegin());
+  }
 };
 /**
@@ -309,34 +401,34 @@
  */
 template <typename T>
 class dvec2 {
   friend bulk_allocator;

 private:
   dvec<T> _d1, _d2;
   cub::DoubleBuffer<T> _buff;
+  int _device_idx;

-  void external_allocate(void *ptr1, void *ptr2, size_t size) {
+ public:
+  void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
     if (!empty()) {
       throw std::runtime_error("Tried to allocate dvec2 but already allocated");
     }
-    _d1.external_allocate(ptr1, size);
-    _d2.external_allocate(ptr2, size);
+    _d1.external_allocate(_device_idx, ptr1, size);
+    _d2.external_allocate(_device_idx, ptr2, size);
     _buff.d_buffers[0] = static_cast<T *>(ptr1);
     _buff.d_buffers[1] = static_cast<T *>(ptr2);
     _buff.selector = 0;
+    _device_idx = device_idx;
   }

- public:
-  dvec2() : _d1(), _d2(), _buff() {}
+  dvec2() : _d1(), _d2(), _buff(), _device_idx(0) {}

   size_t size() const { return _d1.size(); }
+  int device_idx() const { return _device_idx; }
   bool empty() const { return _d1.empty() || _d2.empty(); }

   cub::DoubleBuffer<T> &buff() { return _buff; }

   dvec<T> &d1() { return _d1; }

   dvec<T> &d2() { return _d2; }

   T *current() { return _buff.Current(); }
@@ -346,9 +438,11 @@ class dvec2 {
   T *other() { return _buff.Alternate(); }
 };

+template <memory_type MemoryT>
 class bulk_allocator {
-  char *d_ptr;
-  size_t _size;
+  std::vector<char *> d_ptr;
+  std::vector<size_t> _size;
+  std::vector<int> _device_idx;

   const int align = 256;

@@ -369,18 +463,32 @@
   }

   template <typename T, typename SizeT>
-  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size) {
-    first_vec->external_allocate(static_cast<void *>(ptr), first_size);
+  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
+                     SizeT first_size) {
+    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
+                                 first_size);
   }

   template <typename T, typename SizeT, typename... Args>
-  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
-                     Args... args) {
-    allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
+  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
+                     SizeT first_size, Args... args) {
+    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
+                                 first_size);
     ptr += align_round_up(first_size * sizeof(T));
-    allocate_dvec(ptr, args...);
+    allocate_dvec(device_idx, ptr, args...);
   }

+  // template <memory_type MemoryT>
+  char *allocate_device(int device_idx, size_t bytes, memory_type t) {
+    char *ptr;
+    if (t == memory_type::DEVICE) {
+      safe_cuda(cudaSetDevice(device_idx));
+      safe_cuda(cudaMalloc(&ptr, bytes));
+    } else {
+      safe_cuda(cudaMallocManaged(&ptr, bytes));
+    }
+    return ptr;
+  }
   template <typename T, typename SizeT>
   size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
     return 2 * align_round_up(first_size * sizeof(T));
@@ -392,40 +500,46 @@
   }

   template <typename T, typename SizeT>
-  void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size) {
-    first_vec->external_allocate
-      (static_cast<void *>(ptr),
+  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size) {
+    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
       static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
       first_size);
   }

   template <typename T, typename SizeT, typename... Args>
-  void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size,
+  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size,
                      Args... args) {
-    allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
+    allocate_dvec<T,SizeT>(device_idx, ptr, first_vec, first_size);
     ptr += (align_round_up(first_size * sizeof(T)) * 2);
-    allocate_dvec(ptr, args...);
+    allocate_dvec(device_idx, ptr, args...);
   }

 public:
   bulk_allocator() : _size(0), d_ptr(NULL) {}

   ~bulk_allocator() {
-    if (!(d_ptr == nullptr)) {
-      safe_cuda(cudaFree(d_ptr));
+    for (int i = 0; i < d_ptr.size(); i++) {
+      if (!(d_ptr[i] == nullptr)) {
+        safe_cuda(cudaSetDevice(_device_idx[i]));
+        safe_cuda(cudaFree(d_ptr[i]));
+      }
     }
   }

-  size_t size() { return _size; }
+  // returns sum of bytes for all allocations
+  size_t size() {
+    return std::accumulate(_size.begin(), _size.end(), static_cast<size_t>(0));
+  }

   template <typename... Args>
-  void allocate(Args... args) {
-    if (d_ptr != NULL) {
-      throw std::runtime_error("Bulk allocator already allocated");
-    }
-    _size = get_size_bytes(args...);
-    safe_cuda(cudaMalloc(&d_ptr, _size));
-    allocate_dvec(d_ptr, args...);
+  void allocate(int device_idx, Args... args) {
+    size_t size = get_size_bytes(args...);
+
+    char *ptr = allocate_device(device_idx, size, MemoryT);
+
+    allocate_dvec(device_idx, ptr, args...);
+
+    d_ptr.push_back(ptr);
+    _size.push_back(size);
+    _device_idx.push_back(device_idx);
   }
 };
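A sketch of how the reworked allocator is meant to be used across devices, grounded in the new `allocate(device_idx, ...)` signature above (variable names and sizes are illustrative, not plugin code):

```cpp
#include "device_helpers.cuh"  // assumed header for dh::bulk_allocator, dh::dvec

void allocate_per_device_example() {
  // One bulk allocation per device; gpair0/gpair1 and n are placeholders.
  dh::bulk_allocator<dh::memory_type::DEVICE> ba;
  dh::dvec<float> gpair0, gpair1;
  size_t n = 1 << 20;
  ba.allocate(/*device_idx=*/0, &gpair0, n);  // backing memory on GPU 0
  ba.allocate(/*device_idx=*/1, &gpair1, n);  // backing memory on GPU 1
  // ba.size() reports the sum of both allocations; the destructor sets the
  // recorded device before freeing each pointer.
}
```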
|
||||
@ -455,19 +569,14 @@ struct CubMemory {
|
||||
bool IsAllocated() { return d_temp_storage != NULL; }
|
||||
};
|
||||
|
||||
inline size_t available_memory() {
|
||||
inline size_t available_memory(int device_idx) {
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
safe_cuda(cudaMemGetInfo(&device_free, &device_total));
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
|
||||
return device_free;
|
||||
}
|
||||
|
||||
inline std::string device_name() {
|
||||
cudaDeviceProp prop;
|
||||
safe_cuda(cudaGetDeviceProperties(&prop, 0));
|
||||
return std::string(prop.name);
|
||||
}

/*
* Utility functions
*/
@ -481,7 +590,7 @@ void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
std::cout << "\n";
}

template <typename T>
template <typename T, memory_type MemoryT>
void print(const dvec<T> &v, size_t max_items = 10) {
std::vector<T> h = v.as_vector();
for (int i = 0; i < std::min(max_items, h.size()); i++) {
@ -530,17 +639,46 @@ size_t size_bytes(const thrust::device_vector<T> &v) {
*/

template <typename L>
__global__ void launch_n_kernel(size_t n, L lambda) {
for (auto i : grid_stride_range(static_cast<size_t>(0), n)) {
__global__ void launch_n_kernel(size_t begin, size_t end, L lambda) {
for (auto i : grid_stride_range(begin, end)) {
lambda(i);
}
}
template <typename L>
__global__ void launch_n_kernel(int device_idx, size_t begin, size_t end,
L lambda) {
for (auto i : grid_stride_range(begin, end)) {
lambda(i, device_idx);
}
}

template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void launch_n(size_t n, L lambda) {
inline void launch_n(int device_idx, size_t n, L lambda) {
safe_cuda(cudaSetDevice(device_idx));
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
#if defined(__CUDACC__)
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
lambda);
#endif
}
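For illustration, a hedged sketch of calling launch_n with a device lambda, mirroring the node_id fill later in this diff. The buffer is made up, and --expt-extended-lambda from the CMake flags is required:

// Sketch: double every element of a device buffer on GPU 0.
inline void example_launch(float *d_out, size_t n) {  // d_out: cudaMalloc'd
  launch_n(0, n, [=] __device__(size_t i) {
    d_out[i] = d_out[i] * 2.0f;  // i covers [0, n) in grid strides
  });
}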

// if n_devices=-1, then use all visible devices
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void multi_launch_n(size_t n, int n_devices, L lambda) {
n_devices = n_devices < 0 ? n_visible_devices() : n_devices;
CHECK_LE(n_devices, n_visible_devices()) << "Number of devices requested "
"must be less than or equal to "
"the number of visible devices.";
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
#if defined(__CUDACC__)
n_devices = n_devices > n ? n : n_devices;
for (int device_idx = 0; device_idx < n_devices; device_idx++) {
safe_cuda(cudaSetDevice(device_idx));
size_t begin = (n / n_devices) * device_idx;
// the last device takes the remainder so the full range [0, n) is covered
size_t end = device_idx == n_devices - 1 ? n : (n / n_devices) * (device_idx + 1);
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(device_idx, begin, end,
lambda);
}
#endif
}
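A hedged usage sketch: the range [0, n) is split into contiguous per-device slices and the lambda also receives the device it runs on. A managed (cudaMallocManaged) buffer is assumed so every GPU can touch its slice:

// Sketch: square every element, work striped across all visible GPUs.
inline void example_multi_launch(float *data, size_t n) {  // managed memory
  multi_launch_n(n, -1, [=] __device__(size_t i, int device_idx) {
    data[i] = data[i] * data[i];  // i stays inside this device's slice
  });
}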


@ -168,7 +168,7 @@ void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
node_id_t nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
switch(algo) {
case ABK_GMEM:

@ -208,7 +208,7 @@ private:
dh::dvec<gpu_gpair> tmpScanGradBuff;
dh::dvec<int> tmpScanKeyBuff;
dh::dvec<int> colIds;
dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;

void findSplit(int level, node_id_t nodeStart, int nNodes) {
reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
@ -226,7 +226,8 @@ private:

void allocateAllData(int offsetSize) {
int tmpBuffSize = scanTempBufferSize(nVals);
ba.allocate(&vals, nVals,
ba.allocate(param.gpu_id,
&vals, nVals,
&vals_cached, nVals,
&instIds, nVals,
&instIds_cached, nVals,
@ -245,7 +246,7 @@ private:
}

void setupOneTimeData(DMatrix& hMat) {
size_t free_memory = dh::available_memory();
size_t free_memory = dh::available_memory(param.gpu_id);
if (!hMat.SingleColBlock()) {
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
}
@ -258,7 +259,7 @@ private:
if (!param.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
}
}

@ -340,7 +341,7 @@ private:
colOffsets.data(), vals.current(),
nVals, nCols);
// gather the node assignments across all other columns too
gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
instIds.current(), nVals);
sortKeys(level);
}
@ -351,7 +352,7 @@ private:
// but we don't need more than level+1 bits for sorting!
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
0, level+1);
gather<float,int>(vals.other(), vals.current(), instIds.other(),
gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
instIds.current(), nodeLocations.current(), nVals);
vals.buff().selector ^= 1;
instIds.buff().selector ^= 1;

@ -2,14 +2,10 @@
* Copyright 2016 Rory Mitchell
*/
#pragma once
#include "types.cuh"
#include "../../../src/tree/param.h"
#include "../../../src/common/random.h"

#include "../../../src/tree/param.h"
#include "types.cuh"

namespace xgboost {
namespace tree {


} // namespace tree
namespace tree {} // namespace tree
} // namespace xgboost

@ -21,7 +21,8 @@ struct GPUData {
int n_features;
int n_instances;

dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
// dh::bulk_allocator<int> ba;
GPUTrainingParam param;

dh::dvec<float> fvalues;
@ -72,24 +73,25 @@ struct GPUData {
n_features, foffsets.data(), foffsets.data() + 1);

// Allocate memory
size_t free_memory = dh::available_memory();
ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
&fvalues_cached, in_fvalues.size(), &foffsets,
size_t free_memory = dh::available_memory(param_in.gpu_id);
ba.allocate(param_in.gpu_id,
&fvalues, in_fvalues.size(), &fvalues_temp,
in_fvalues.size(), &fvalues_cached, in_fvalues.size(), &foffsets,
in_foffsets.size(), &instance_id, in_instance_id.size(),
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
in_instance_id.size(), &feature_id, in_feature_id.size(),
&node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
&node_id_instance, n_instances, &gpair, n_instances, &nodes,
max_nodes, &split_candidates, max_nodes_level * n_features,
&node_sums, max_nodes_level * n_features, &node_offsets,
max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
&sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size,
&feature_flags, n_features, &feature_set, n_features);
in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id,
in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance,
n_instances, &gpair, n_instances, &nodes, max_nodes, &split_candidates,
max_nodes_level * n_features, &node_sums, max_nodes_level * n_features,
&node_offsets, max_nodes_level * n_features, &sort_index_in,
in_fvalues.size(), &sort_index_out, in_fvalues.size(), &cub_mem,
cub_mem_size, &feature_flags, n_features, &feature_set, n_features);

if (!param_in.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on "
<< dh::device_name(param_in.gpu_id);
}

fvalues_cached = in_fvalues;
@ -134,7 +136,8 @@ struct GPUData {
auto d_node_id_instance = node_id_instance.data();
auto d_instance_id = instance_id.data();

dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
dh::launch_n(node_id.device_idx(), fvalues.size(),
[=] __device__(bst_uint i) {
d_node_id[i] = d_node_id_instance[d_instance_id[i]];
});
}

File diff suppressed because it is too large
@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>
@ -11,6 +11,14 @@
#include "device_helpers.cuh"
#include "types.cuh"

#ifndef NCCL
#define NCCL 1
#endif

#if (NCCL)
#include "nccl.h"
#endif

namespace xgboost {

namespace tree {
@ -18,7 +26,8 @@ namespace tree {
struct DeviceGMat {
dh::dvec<int> gidx;
dh::dvec<int> ridx;
void Init(const common::GHistIndexMatrix &gmat);
void Init(int device_idx, const common::GHistIndexMatrix &gmat,
bst_uint begin, bst_uint end);
};

struct HistBuilder {
@ -31,11 +40,11 @@ struct HistBuilder {

struct DeviceHist {
int n_bins;
dh::dvec<gpu_gpair> hist;
dh::dvec<gpu_gpair> data;

void Init(int max_depth);

void Reset();
void Reset(int device_idx);

HistBuilder GetBuilder();

@ -64,7 +73,9 @@ class GPUHistBuilder {
void FindSplit(int depth);
template <int BLOCK_THREADS>
void FindSplitSpecialize(int depth);
void InitFirstNode();
template <int BLOCK_THREADS>
void LaunchFindSplit(int depth);
void InitFirstNode(const std::vector<bst_gpair> &gpair);
void UpdatePosition(int depth);
void UpdatePositionDense(int depth);
void UpdatePositionSparse(int depth);
@ -80,32 +91,48 @@ class GPUHistBuilder {
MetaInfo *info;
bool initialised;
bool is_dense;
DeviceGMat device_matrix;
const DMatrix *p_last_fmat_;

dh::bulk_allocator ba;
dh::CubMemory cub_mem;
dh::dvec<int> gidx_feature_map;
dh::dvec<int> hist_node_segments;
dh::dvec<int> feature_segments;
dh::dvec<float> gain;
dh::dvec<NodeIdT> position;
dh::dvec<NodeIdT> position_tmp;
dh::dvec<float> gidx_fvalue_map;
dh::dvec<float> fidx_min_map;
DeviceHist hist;
dh::dvec<cub::KeyValuePair<int, float>> argmax;
dh::dvec<gpu_gpair> node_sums;
dh::dvec<gpu_gpair> hist_scan;
dh::dvec<gpu_gpair> device_gpair;
dh::dvec<Node> nodes;
dh::dvec<int> feature_flags;
dh::dvec<bool> left_child_smallest;
dh::dvec<bst_float> prediction_cache;
bool prediction_cache_initialised;

// choose which memory type to use (DEVICE or DEVICE_MANAGED)
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
// dh::bulk_allocator<dh::memory_type::DEVICE_MANAGED> ba; // can't be used
// with NCCL
dh::CubMemory cub_mem;

std::vector<int> feature_set_tree;
std::vector<int> feature_set_level;

bst_uint num_rows;
int n_devices;

// the vectors below hold one entry per device in use
std::vector<int> dList;
std::vector<int> device_row_segments;
std::vector<int> device_element_segments;

std::vector<DeviceHist> hist_vec;
std::vector<dh::dvec<Node>> nodes;
std::vector<dh::dvec<Node>> nodes_temp;
std::vector<dh::dvec<Node>> nodes_child_temp;
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
std::vector<dh::dvec<float>> fidx_min_map;
std::vector<dh::dvec<int>> feature_segments;
std::vector<dh::dvec<bst_float>> prediction_cache;
std::vector<dh::dvec<NodeIdT>> position;
std::vector<dh::dvec<NodeIdT>> position_tmp;
std::vector<DeviceGMat> device_matrix;
std::vector<dh::dvec<gpu_gpair>> device_gpair;
std::vector<dh::dvec<int>> gidx_feature_map;
std::vector<dh::dvec<float>> gidx_fvalue_map;

std::vector<cudaStream_t *> streams;
#if (NCCL)
std::vector<ncclComm_t> comms;
std::vector<std::vector<ncclComm_t>> find_split_comms;
#endif
};
} // namespace tree
} // namespace xgboost
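The comms/streams members above carry the NCCL state behind the multi-GPU histogram. A hedged sketch of the reduction pattern they enable; the buffer layout, the flat stream vector, and the helper name are assumptions, only the NCCL and CUDA calls are real API:

#include <vector>
#include <cuda_runtime.h>
#include "nccl.h"

// Sketch: sum per-device partial histograms so every GPU sees the total.
// hist_data[d] holds `count` floats on device d; comms and streams are
// one per device, as in the members declared above.
inline void all_reduce_hist(std::vector<ncclComm_t> &comms,
                            std::vector<cudaStream_t> &streams,
                            std::vector<float *> &hist_data, size_t count) {
  for (size_t d = 0; d < comms.size(); ++d) {
    ncclAllReduce(hist_data[d], hist_data[d], count, ncclFloat, ncclSum,
                  comms[d], streams[d]);  // in-place allreduce
  }
  for (size_t d = 0; d < streams.size(); ++d) {
    cudaSetDevice(static_cast<int>(d));
    cudaStreamSynchronize(streams[d]);    // wait for the reduction
  }
}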

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#pragma once
#include <thrust/device_vector.h>

@ -1,5 +1,5 @@
/*!
* Copyright 2016 Rory Mitchell
* Copyright 2017 XGBoost contributors
*/
#include <xgboost/tree_updater.h>
#include <vector>

@ -1,3 +1,4 @@
from __future__ import print_function
#pylint: skip-file
import sys
sys.path.append("../../tests/python")
@ -12,6 +13,10 @@ dpath = '../../demo/data/'
ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)
    print(*args, file=sys.stdout, **kwargs)


class TestGPU(unittest.TestCase):
def test_grow_gpu(self):
@ -58,7 +63,7 @@ class TestGPU(unittest.TestCase):
'max_depth': 3,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')],
xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert self.non_decreasing(res['test']['auc'])
@ -74,13 +79,13 @@ class TestGPU(unittest.TestCase):
'max_depth': 2,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)

assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=10, replace=False):
for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 2

dtrain3 = xgb.DMatrix(X2, label=y2)
@ -92,17 +97,18 @@ class TestGPU(unittest.TestCase):
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=10, replace=False):
for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 3

dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85


def test_grow_gpu_hist(self):
n_gpus = -1
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
try:
@ -110,31 +116,59 @@ class TestGPU(unittest.TestCase):
except:
from sklearn.cross_validation import train_test_split

for max_depth in range(3, 10):  # TODO: doesn't work with 2 for some tests
#eprint("max_depth=%d" % (max_depth))

for max_bin_i in range(3, 11):
max_bin = np.power(2, max_bin_i)
#eprint("max_bin=%d" % (max_bin))

# regression test --- hist must be same as exact on all-categorical data
ag_param = {'max_depth': 2,
ag_param = {'max_depth': max_depth,
'tree_method': 'exact',
'nthread': 1,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': 2,
ag_param2 = {'max_depth': max_depth,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': 1,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_param3 = {'max_depth': max_depth,
'updater': 'grow_gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': n_gpus,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_res = {}
ag_res2 = {}
ag_res3 = {}

num_rounds = 10
#eprint("normal updater")
xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res)
#eprint("grow_gpu_hist updater 1 gpu")
xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res2)
#eprint("grow_gpu_hist updater %d gpus" % (n_gpus))
xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res3)
assert ag_res['train']['auc'] == ag_res2['train']['auc']
assert ag_res['test']['auc'] == ag_res2['test']['auc']
assert ag_res['test']['auc'] == ag_res3['test']['auc']

######################################################################
digits = load_digits(2)
X = digits['data']
y = digits['target']
@ -144,14 +178,32 @@ class TestGPU(unittest.TestCase):

param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'max_depth': max_depth,
'n_gpus': 1,
'max_bin': max_bin,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain, 10, [(dtrain, 'train'), (dtest, 'test')],
#eprint("digits: grow_gpu_hist updater 1 gpu")
xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert self.non_decreasing(res['test']['auc'])
#assert self.non_decreasing(res['test']['auc'])
param2 = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
'eval_metric': 'auc'}
res2 = {}
#eprint("digits: grow_gpu_hist updater %d gpus" % (n_gpus))
xgb.train(param2, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
evals_result=res2)
assert self.non_decreasing(res2['train']['auc'])
#assert self.non_decreasing(res2['test']['auc'])
assert res['train']['auc'] == res2['train']['auc']
#assert res['test']['auc'] == res2['test']['auc']

######################################################################
# fail-safe test for dense data
from sklearn.datasets import load_svmlight_file
X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train')
@ -160,16 +212,19 @@ class TestGPU(unittest.TestCase):

param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 2,
'max_depth': max_depth,
'n_gpus': n_gpus,
'max_bin': max_bin,
'eval_metric': 'auc'}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)

assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in rng.choice(X2.shape[0], size=10, replace=False):
for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 2

dtrain3 = xgb.DMatrix(X2, label=y2)
@ -178,54 +233,63 @@ class TestGPU(unittest.TestCase):
xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res)

assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

for j in range(X2.shape[1]):
for i in np.random.choice(X2.shape[0], size=10, replace=False):
for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
X2[i, j] = 3

dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

######################################################################
# fail-safe test for max_bin
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'max_bin': max_bin}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# subsampling
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'subsample': 0.5,
'max_bin': max_bin}
res = {}
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85
######################################################################
# fail-safe test for max_bin=2
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 2,
'n_gpus': n_gpus,
'eval_metric': 'auc',
'max_bin': 2}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
if max_bin > 32:
assert res['train']['auc'][0] >= 0.85

# subsampling
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc',
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'subsample': 0.5
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

# max_bin = 2048
param = {'objective': 'binary:logistic',
'updater': 'grow_gpu_hist',
'max_depth': 3,
'eval_metric': 'auc',
'max_bin': 2048
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

def non_decreasing(self, L):
return all((x - y) < 0.001 for x, y in zip(L, L[1:]))

@ -81,6 +81,8 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
std::vector<int> monotone_constraints;
// gpu to use for single gpu algorithms
int gpu_id;
// number of GPUs to use
int n_gpus;
// declare the parameters
DMLC_DECLARE_PARAMETER(TrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
@ -192,6 +194,10 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
.set_lower_bound(0)
.set_default(0)
.describe("gpu to use for single gpu algorithms");
DMLC_DECLARE_FIELD(n_gpus)
.set_lower_bound(-1)
.set_default(-1)
.describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs");
// add alias of parameters
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
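A hedged sketch of how these two parameters plausibly combine into a device list; the wrap-around from gpu_id is an assumption based on the dList member of the histogram builder, not confirmed by the lines shown here:

#include <algorithm>
#include <vector>
#include <cuda_runtime.h>

// Sketch: resolve (gpu_id, n_gpus) into concrete CUDA device ordinals.
std::vector<int> make_device_list(int gpu_id, int n_gpus) {
  int n_visible = 0;
  safe_cuda(cudaGetDeviceCount(&n_visible));
  int n = n_gpus < 0 ? n_visible : std::min(n_gpus, n_visible);
  std::vector<int> dList(n);
  for (int i = 0; i < n; ++i) {
    dList[i] = (gpu_id + i) % n_visible;  // start at gpu_id, wrap around
  }
  return dList;
}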