Compare commits
183 Commits
v2.0.3
...
release_2.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1ce5029a96 | ||
|
|
420f8d6fde | ||
|
|
dc7ee041cc | ||
|
|
7dc152450e | ||
|
|
82d81bca94 | ||
|
|
6ec5cf26fc | ||
|
|
1ec57fd1a3 | ||
|
|
d0774a78e4 | ||
|
|
8d160a206e | ||
|
|
a41bc0975c | ||
|
|
782b73f2bb | ||
|
|
2e7e9d3b2d | ||
|
|
3e0c7d1dee | ||
|
|
2f47a1ebe6 | ||
|
|
5ca7daaa13 | ||
|
|
5f78360949 | ||
|
|
35cde3b1b2 | ||
|
|
ce345c30a8 | ||
|
|
af8845405a | ||
|
|
9ee1852d4e | ||
|
|
6ecd7903f2 | ||
|
|
42867a4805 | ||
|
|
c5b575e00e | ||
|
|
1354138b7d | ||
|
|
b994a38b28 | ||
|
|
3a834c4992 | ||
|
|
b22644fc10 | ||
|
|
7663d47383 | ||
|
|
88fc8badfa | ||
|
|
8cad8c693c | ||
|
|
b066accad6 | ||
|
|
b324d51f14 | ||
|
|
65097212b3 | ||
|
|
4a24ca2f95 | ||
|
|
83e6fceb5c | ||
|
|
e4538cb13c | ||
|
|
5446c501af | ||
|
|
313a74b582 | ||
|
|
65d83e288f | ||
|
|
f645cf51c1 | ||
|
|
db8420225b | ||
|
|
843fdde61b | ||
|
|
08bc4b0c0f | ||
|
|
6825d986fd | ||
|
|
d155ec77f9 | ||
|
|
991738690f | ||
|
|
aeb3fd1c95 | ||
|
|
141a062e00 | ||
|
|
acad01afc9 | ||
|
|
f289e5001d | ||
|
|
06d9b998ce | ||
|
|
c50cc424bc | ||
|
|
8c77e936d1 | ||
|
|
18034a4291 | ||
|
|
7ee4734d3a | ||
|
|
ee582f03c3 | ||
|
|
f3286bac04 | ||
|
|
3ee3bea683 | ||
|
|
5098735698 | ||
|
|
e74b3bbf3c | ||
|
|
22525c002a | ||
|
|
80961039d7 | ||
|
|
1474789787 | ||
|
|
1dc138404a | ||
|
|
e1d050f64e | ||
|
|
7fbc561e17 | ||
|
|
d97be6f396 | ||
|
|
f1211cffca | ||
|
|
e0716afabf | ||
|
|
595cd81251 | ||
|
|
0325ce0bed | ||
|
|
a79a35c22c | ||
|
|
4484c7f073 | ||
|
|
8207015e48 | ||
|
|
364df7db0f | ||
|
|
a2bab03205 | ||
|
|
b71c1b50de | ||
|
|
fa2336fcfd | ||
|
|
7d96758382 | ||
|
|
b0dacc5a80 | ||
|
|
f64152bf97 | ||
|
|
b4dbe7a649 | ||
|
|
e5b6219a84 | ||
|
|
3a07b1edf8 | ||
|
|
9bf16a2ca6 | ||
|
|
332f6a89a9 | ||
|
|
204d0c9a53 | ||
|
|
e961016e71 | ||
|
|
f0b8c02f15 | ||
|
|
5e8b1842b9 | ||
|
|
9f072b50ba | ||
|
|
e1ddb5ae58 | ||
|
|
643e2a7b39 | ||
|
|
bde3107c3e | ||
|
|
5edfc1e2e9 | ||
|
|
c073417d0c | ||
|
|
9bbbeb3f03 | ||
|
|
4bde2e3412 | ||
|
|
58a9fe07b6 | ||
|
|
41407850d5 | ||
|
|
968a1db4c0 | ||
|
|
ad710e4888 | ||
|
|
4e3c699814 | ||
|
|
757de84398 | ||
|
|
d27f9dfdce | ||
|
|
14cc438a64 | ||
|
|
911a5d8a60 | ||
|
|
54b076b40f | ||
|
|
91a5ef762e | ||
|
|
8fd2af1c8b | ||
|
|
bb6adda8a3 | ||
|
|
a76ccff390 | ||
|
|
61c0b19331 | ||
|
|
fa9f69dd85 | ||
|
|
080fc35c4b | ||
|
|
ccce4cf7e1 | ||
|
|
713ab9e1a0 | ||
|
|
134cbfddbe | ||
|
|
6e2c5be83e | ||
|
|
185dbce21f | ||
|
|
49732359ef | ||
|
|
ec9f500a49 | ||
|
|
53244bef6f | ||
|
|
f0febfbcac | ||
|
|
1c58ff61d1 | ||
|
|
1530c03f7d | ||
|
|
309268de02 | ||
|
|
500428cc0f | ||
|
|
495816f694 | ||
|
|
df42dd2c53 | ||
|
|
f55243fda0 | ||
|
|
1e09c21456 | ||
|
|
0ed5d3c849 | ||
|
|
f67e7de7ef | ||
|
|
5044713388 | ||
|
|
c875f0425f | ||
|
|
4fd08b6c32 | ||
|
|
b9d86d44d6 | ||
|
|
a56055225a | ||
|
|
6eba0a56ec | ||
|
|
00c24a58b1 | ||
|
|
6fa248b75f | ||
|
|
946f9e9802 | ||
|
|
4c4e5af29c | ||
|
|
7e1b06417b | ||
|
|
cdd7794641 | ||
|
|
cd743a1ae9 | ||
|
|
a45005863b | ||
|
|
bdcb036592 | ||
|
|
7a3a9b682a | ||
|
|
0a711662c3 | ||
|
|
312e58ec99 | ||
|
|
ca8f4e7993 | ||
|
|
60795f22de | ||
|
|
05fdca893f | ||
|
|
d8cc93f3f2 | ||
|
|
62c4efac51 | ||
|
|
ba9e00d911 | ||
|
|
d3be67ad8e | ||
|
|
2eb0b6aae4 | ||
|
|
327f1494f1 | ||
|
|
fa92aa56ee | ||
|
|
427f6c2a1a | ||
|
|
270c7b4802 | ||
|
|
0fc1f640a9 | ||
|
|
762fd9028d | ||
|
|
f2009533e1 | ||
|
|
53b5cd73f2 | ||
|
|
52b05d934e | ||
|
|
840f15209c | ||
|
|
1e1c7fd8d5 | ||
|
|
f5f800c80d | ||
|
|
6b7be96373 | ||
|
|
75712b9c3c | ||
|
|
ed45aa2816 | ||
|
|
f286ae5bfa | ||
|
|
f13a7f8d91 | ||
|
|
c51a1c9aae | ||
|
|
30de728631 | ||
|
|
75fa15b36d | ||
|
|
eb30cb6293 | ||
|
|
cafbfce51f | ||
|
|
6039a71e6c |
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -5,3 +5,6 @@
|
||||
[submodule "gputreeshap"]
|
||||
path = gputreeshap
|
||||
url = https://github.com/rapidsai/gputreeshap.git
|
||||
[submodule "rocgputreeshap"]
|
||||
path = rocgputreeshap
|
||||
url = https://github.com/ROCmSoftwarePlatform/rocgputreeshap
|
||||
|
||||
@@ -58,7 +58,7 @@ option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC
|
||||
option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
|
||||
option(GOOGLE_TEST "Build google tests" OFF)
|
||||
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
|
||||
option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF)
|
||||
option(USE_DEVICE_DEBUG "Generate CUDA/HIP device debug info." OFF)
|
||||
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
|
||||
set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
|
||||
option(RABIT_MOCK "Build rabit with mock" OFF)
|
||||
@@ -71,6 +71,10 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
|
||||
option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
|
||||
set(GPU_COMPUTE_VER "" CACHE STRING
|
||||
"Semicolon separated list of compute versions to be built against, e.g. '35;61'")
|
||||
## HIP
|
||||
option(USE_HIP "Build with GPU acceleration" OFF)
|
||||
option(USE_RCCL "Build with RCCL to enable distributed GPU support." OFF)
|
||||
option(BUILD_WITH_SHARED_RCCL "Build with shared RCCL library." OFF)
|
||||
## Copied From dmlc
|
||||
option(USE_HDFS "Build with HDFS support" OFF)
|
||||
option(USE_AZURE "Build with AZURE support" OFF)
|
||||
@@ -93,6 +97,7 @@ option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
|
||||
if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
|
||||
message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
|
||||
endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
|
||||
|
||||
if (USE_NCCL AND NOT (USE_CUDA))
|
||||
message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
|
||||
endif (USE_NCCL AND NOT (USE_CUDA))
|
||||
@@ -102,6 +107,17 @@ endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
|
||||
if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
|
||||
message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
|
||||
endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
|
||||
|
||||
if (USE_RCCL AND NOT (USE_HIP))
|
||||
message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
|
||||
endif (USE_RCCL AND NOT (USE_HIP))
|
||||
if (USE_DEVICE_DEBUG AND NOT (USE_HIP))
|
||||
message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_HIP` flag.")
|
||||
endif (USE_DEVICE_DEBUG AND NOT (USE_HIP))
|
||||
if (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
|
||||
message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
|
||||
endif (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
|
||||
|
||||
if (JVM_BINDINGS AND R_LIB)
|
||||
message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
|
||||
endif (JVM_BINDINGS AND R_LIB)
|
||||
@@ -115,9 +131,15 @@ endif (USE_AVX)
|
||||
if (PLUGIN_LZ4)
|
||||
message(SEND_ERROR "The option 'PLUGIN_LZ4' is removed from XGBoost.")
|
||||
endif (PLUGIN_LZ4)
|
||||
|
||||
if (PLUGIN_RMM AND NOT (USE_CUDA))
|
||||
message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
|
||||
endif (PLUGIN_RMM AND NOT (USE_CUDA))
|
||||
|
||||
if (PLUGIN_RMM AND NOT (USE_HIP))
|
||||
message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_HIP` flag.")
|
||||
endif (PLUGIN_RMM AND NOT (USE_HIP))
|
||||
|
||||
if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
|
||||
message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
|
||||
endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
|
||||
@@ -170,6 +192,24 @@ if (USE_CUDA)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif (USE_CUDA)
|
||||
|
||||
if (USE_HIP)
|
||||
set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE)
|
||||
# `export CXX=' is ignored by CMake HIP.
|
||||
set(CMAKE_HIP_HOST_COMPILER ${CMAKE_CXX_COMPILER})
|
||||
message(STATUS "Configured HIP host compiler: ${CMAKE_HIP_HOST_COMPILER}")
|
||||
|
||||
enable_language(HIP)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(rocthrust REQUIRED)
|
||||
find_package(hipcub REQUIRED)
|
||||
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
|
||||
add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
|
||||
endif (USE_HIP)
|
||||
|
||||
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
|
||||
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
|
||||
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
|
||||
@@ -209,6 +249,10 @@ if (USE_NCCL)
|
||||
find_package(Nccl REQUIRED)
|
||||
endif (USE_NCCL)
|
||||
|
||||
if (USE_RCCL)
|
||||
find_package(rccl REQUIRED)
|
||||
endif (USE_RCCL)
|
||||
|
||||
# dmlc-core
|
||||
msvc_use_static_runtime()
|
||||
if (FORCE_SHARED_CRT)
|
||||
|
||||
@@ -32,7 +32,7 @@ namespace common {
|
||||
bool CheckNAN(double v) {
|
||||
return ISNAN(v);
|
||||
}
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
double LogGamma(double v) {
|
||||
return lgammafn(v);
|
||||
}
|
||||
|
||||
@@ -181,6 +181,23 @@ function(xgboost_set_cuda_flags target)
|
||||
CUDA_RUNTIME_LIBRARY Static)
|
||||
endfunction(xgboost_set_cuda_flags)
|
||||
|
||||
# Set HIP related flags to target.
|
||||
function(xgboost_set_hip_flags target)
|
||||
if (USE_DEVICE_DEBUG)
|
||||
target_compile_options(${target} PRIVATE
|
||||
$<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:HIP>>:-G>)
|
||||
endif (USE_DEVICE_DEBUG)
|
||||
|
||||
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1)
|
||||
target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
|
||||
target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)
|
||||
|
||||
set_target_properties(${target} PROPERTIES
|
||||
HIP_STANDARD 17
|
||||
HIP_STANDARD_REQUIRED ON
|
||||
HIP_SEPARABLE_COMPILATION OFF)
|
||||
endfunction(xgboost_set_hip_flags)
|
||||
|
||||
macro(xgboost_link_nccl target)
|
||||
if (BUILD_STATIC_LIB)
|
||||
target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
|
||||
@@ -193,6 +210,20 @@ macro(xgboost_link_nccl target)
|
||||
endif (BUILD_STATIC_LIB)
|
||||
endmacro(xgboost_link_nccl)
|
||||
|
||||
macro(xgboost_link_rccl target)
|
||||
if(BUILD_STATIC_LIB)
|
||||
target_include_directories(${target} PUBLIC ${RCCL_INCLUDE_DIR}/rccl)
|
||||
target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
|
||||
target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
|
||||
target_link_libraries(${target} PUBLIC ${RCCL_LIBRARY})
|
||||
else()
|
||||
target_include_directories(${target} PRIVATE ${RCCL_INCLUDE_DIR}/rccl)
|
||||
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
|
||||
target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
|
||||
target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
# compile options
|
||||
macro(xgboost_target_properties target)
|
||||
set_target_properties(${target} PROPERTIES
|
||||
@@ -215,6 +246,10 @@ macro(xgboost_target_properties target)
|
||||
-Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
|
||||
-Wall -Wextra -Wno-expansion-to-defined>
|
||||
)
|
||||
target_compile_options(${target} PUBLIC
|
||||
$<IF:$<COMPILE_LANGUAGE:HIP>,
|
||||
-Wall -Wextra >
|
||||
)
|
||||
endif(ENABLE_ALL_WARNINGS)
|
||||
|
||||
target_compile_options(${target}
|
||||
@@ -283,6 +318,10 @@ macro(xgboost_target_link_libraries target)
|
||||
target_link_libraries(${target} PUBLIC CUDA::cudart_static)
|
||||
endif (USE_CUDA)
|
||||
|
||||
if (USE_HIP)
|
||||
xgboost_set_hip_flags(${target})
|
||||
endif (USE_HIP)
|
||||
|
||||
if (PLUGIN_RMM)
|
||||
target_link_libraries(${target} PRIVATE rmm::rmm)
|
||||
endif (PLUGIN_RMM)
|
||||
@@ -291,6 +330,10 @@ macro(xgboost_target_link_libraries target)
|
||||
xgboost_link_nccl(${target})
|
||||
endif (USE_NCCL)
|
||||
|
||||
if(USE_RCCL)
|
||||
xgboost_link_rccl(${target})
|
||||
endif()
|
||||
|
||||
if (USE_NVTX)
|
||||
target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
|
||||
endif (USE_NVTX)
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
set(USE_OPENMP @USE_OPENMP@)
|
||||
set(USE_CUDA @USE_CUDA@)
|
||||
set(USE_NCCL @USE_NCCL@)
|
||||
set(USE_HIP @USE_HIP@)
|
||||
set(USE_RCCL @USE_RCCL@)
|
||||
set(XGBOOST_BUILD_STATIC_LIB @BUILD_STATIC_LIB@)
|
||||
|
||||
include(CMakeFindDependencyMacro)
|
||||
@@ -15,6 +17,9 @@ if (XGBOOST_BUILD_STATIC_LIB)
|
||||
if(USE_CUDA)
|
||||
find_dependency(CUDA)
|
||||
endif()
|
||||
if(USE_HIP)
|
||||
find_dependency(HIP)
|
||||
endif()
|
||||
# nccl should be linked statically if xgboost is built as static library.
|
||||
endif (XGBOOST_BUILD_STATIC_LIB)
|
||||
|
||||
|
||||
@@ -4,13 +4,13 @@ python mapfeat.py
|
||||
# split train and test
|
||||
python mknfold.py machine.txt 1
|
||||
# training and output the models
|
||||
../../xgboost machine.conf
|
||||
../../../xgboost machine.conf
|
||||
# output predictions of test data
|
||||
../../xgboost machine.conf task=pred model_in=0002.model
|
||||
../../../xgboost machine.conf task=pred model_in=0002.model
|
||||
# print the boosters of 0002.model in dump.raw.txt
|
||||
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
../../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
# print the boosters of 0002.model in dump.nice.txt with feature map
|
||||
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
../../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
|
||||
# cat the result
|
||||
cat dump.nice.txt
|
||||
|
||||
@@ -58,19 +58,19 @@
|
||||
/*!
|
||||
* \brief Tag function as usable by device
|
||||
*/
|
||||
#if defined (__CUDA__) || defined(__NVCC__)
|
||||
#if defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
|
||||
#define XGBOOST_DEVICE __host__ __device__
|
||||
#else
|
||||
#define XGBOOST_DEVICE
|
||||
#endif // defined (__CUDA__) || defined(__NVCC__)
|
||||
#endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
|
||||
|
||||
#if defined(__CUDA__) || defined(__CUDACC__)
|
||||
#if defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
|
||||
#define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
|
||||
#define XGBOOST_DEV_INLINE __device__ __forceinline__
|
||||
#else
|
||||
#define XGBOOST_HOST_DEV_INLINE
|
||||
#define XGBOOST_DEV_INLINE
|
||||
#endif // defined(__CUDA__) || defined(__CUDACC__)
|
||||
#endif // defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
|
||||
|
||||
// These check are for Makefile.
|
||||
#if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
|
||||
@@ -274,8 +274,8 @@ class GradientPairInt64 {
|
||||
GradientPairInt64(GradientPairInt64 const &g) = default;
|
||||
GradientPairInt64 &operator=(GradientPairInt64 const &g) = default;
|
||||
|
||||
XGBOOST_DEVICE [[nodiscard]] T GetQuantisedGrad() const { return grad_; }
|
||||
XGBOOST_DEVICE [[nodiscard]] T GetQuantisedHess() const { return hess_; }
|
||||
[[nodiscard]] XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; }
|
||||
[[nodiscard]] XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; }
|
||||
|
||||
XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) {
|
||||
grad_ += rhs.grad_;
|
||||
|
||||
@@ -58,11 +58,11 @@
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
// Sets a function to call instead of cudaSetDevice();
|
||||
// only added for testing
|
||||
void SetCudaSetDeviceHandler(void (*handler)(int));
|
||||
#endif // __CUDACC__
|
||||
#endif // __CUDACC__ || __HIPCC__
|
||||
|
||||
template <typename T> struct HostDeviceVectorImpl;
|
||||
|
||||
|
||||
@@ -30,11 +30,11 @@
|
||||
|
||||
// decouple it from xgboost.
|
||||
#ifndef LINALG_HD
|
||||
#if defined(__CUDA__) || defined(__NVCC__)
|
||||
#if defined(__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
|
||||
#define LINALG_HD __host__ __device__
|
||||
#else
|
||||
#define LINALG_HD
|
||||
#endif // defined (__CUDA__) || defined(__NVCC__)
|
||||
#endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
|
||||
#endif // LINALG_HD
|
||||
|
||||
namespace xgboost::linalg {
|
||||
@@ -118,7 +118,7 @@ using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value,
|
||||
|
||||
template <int32_t n, typename Fn>
|
||||
LINALG_HD constexpr auto UnrollLoop(Fn fn) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
#pragma unroll n
|
||||
#endif // defined __CUDA_ARCH__
|
||||
for (int32_t i = 0; i < n; ++i) {
|
||||
@@ -136,7 +136,7 @@ int32_t NativePopc(T v) {
|
||||
inline LINALG_HD int Popc(uint32_t v) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return __popc(v);
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
|
||||
return __builtin_popcount(v);
|
||||
#elif defined(_MSC_VER)
|
||||
return __popcnt(v);
|
||||
@@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) {
|
||||
inline LINALG_HD int Popc(uint64_t v) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return __popcll(v);
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
|
||||
return __builtin_popcountll(v);
|
||||
#elif defined(_MSC_VER) && defined(_M_X64)
|
||||
return __popcnt64(v);
|
||||
|
||||
@@ -41,7 +41,9 @@
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#include <cuda_runtime.h>
|
||||
#endif // defined(__CUDACC__)
|
||||
#elif defined(__HIPCC__)
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
/*!
|
||||
* The version number 1910 is picked up from GSL.
|
||||
@@ -104,7 +106,42 @@ namespace common {
|
||||
|
||||
#define SPAN_CHECK KERNEL_CHECK
|
||||
|
||||
#else // ------------------------------ not CUDA ----------------------------
|
||||
#elif defined(__HIPCC__)
|
||||
// Usual logging facility is not available inside device code.
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
// Windows HIP doesn't have __assert_fail.
|
||||
#define HIP_KERNEL_CHECK(cond) \
|
||||
do { \
|
||||
if (XGBOOST_EXPECT(!(cond), false)) { \
|
||||
__builtin_trap(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#else // defined(_MSC_VER)
|
||||
|
||||
#define __ASSERT_STR_HELPER(x) #x
|
||||
|
||||
#if 0
|
||||
#define HIP_KERNEL_CHECK(cond) \
|
||||
(XGBOOST_EXPECT((cond), true) \
|
||||
? static_cast<void>(0) \
|
||||
: __assert_fail(__ASSERT_STR_HELPER((cond)), __FILE__, __LINE__, __PRETTY_FUNCTION__))
|
||||
#else
|
||||
#define HIP_KERNEL_CHECK(cond) \
|
||||
(XGBOOST_EXPECT((cond), true) \
|
||||
? static_cast<void>(0) \
|
||||
: __builtin_trap())
|
||||
#endif
|
||||
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
#define KERNEL_CHECK HIP_KERNEL_CHECK
|
||||
|
||||
#define SPAN_CHECK KERNEL_CHECK
|
||||
|
||||
#else // ------------------------------ not CUDA or HIP ----------------------------
|
||||
|
||||
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
|
||||
|
||||
@@ -120,7 +157,7 @@ namespace common {
|
||||
|
||||
#endif // defined(XGBOOST_STRICT_R_MODE)
|
||||
|
||||
#endif // __CUDA_ARCH__
|
||||
#endif // __CUDA_ARCH__ || __HIPCC__
|
||||
|
||||
#define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))
|
||||
|
||||
@@ -317,7 +354,7 @@ struct IsSpanOracle<Span<T, Extent>> : std::true_type {};
|
||||
template <class T>
|
||||
struct IsSpan : public IsSpanOracle<typename std::remove_cv<T>::type> {};
|
||||
|
||||
// Re-implement std algorithms here to adopt CUDA.
|
||||
// Re-implement std algorithms here to adopt CUDA/HIP
|
||||
template <typename T>
|
||||
struct Less {
|
||||
XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const {
|
||||
|
||||
@@ -9,6 +9,11 @@ if (USE_CUDA)
|
||||
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
|
||||
endif (USE_CUDA)
|
||||
|
||||
if (USE_HIP)
|
||||
list(APPEND JVM_SOURCES
|
||||
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip)
|
||||
endif (USE_HIP)
|
||||
|
||||
add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})
|
||||
|
||||
if (ENABLE_ALL_WARNINGS)
|
||||
|
||||
@@ -21,6 +21,8 @@ CONFIG = {
|
||||
"USE_S3": "OFF",
|
||||
"USE_CUDA": "OFF",
|
||||
"USE_NCCL": "OFF",
|
||||
"USE_HIP": "OFF",
|
||||
"USE_RCCL": "OFF",
|
||||
"JVM_BINDINGS": "ON",
|
||||
"LOG_CAPI_INVOCATION": "OFF",
|
||||
}
|
||||
@@ -79,7 +81,7 @@ def native_build(args):
|
||||
|
||||
print("building Java wrapper")
|
||||
with cd(".."):
|
||||
build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
|
||||
build_dir = "build-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "build"
|
||||
maybe_makedirs(build_dir)
|
||||
with cd(build_dir):
|
||||
if sys.platform == "win32":
|
||||
@@ -98,6 +100,9 @@ def native_build(args):
|
||||
if cli_args.use_cuda == "ON":
|
||||
CONFIG["USE_CUDA"] = "ON"
|
||||
CONFIG["USE_NCCL"] = "ON"
|
||||
elif cli_args.use_hip == "ON":
|
||||
CONFIG["USE_HIP"] = "ON"
|
||||
CONFIG["USE_RCCL"] = "ON"
|
||||
|
||||
args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
|
||||
|
||||
@@ -120,9 +125,9 @@ def native_build(args):
|
||||
run(f'"{sys.executable}" mapfeat.py')
|
||||
run(f'"{sys.executable}" mknfold.py machine.txt 1')
|
||||
|
||||
xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
|
||||
xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "xgboost4j"
|
||||
xgboost4j_spark = (
|
||||
"xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
|
||||
"xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "xgboost4j-spark"
|
||||
)
|
||||
|
||||
print("copying native library")
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
<maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
|
||||
<log.capi.invocation>OFF</log.capi.invocation>
|
||||
<use.cuda>OFF</use.cuda>
|
||||
<use.hip>OFF</use.hip>
|
||||
<cudf.version>23.08.0</cudf.version>
|
||||
<spark.rapids.version>23.08.0</spark.rapids.version>
|
||||
<cudf.classifier>cuda11</cudf.classifier>
|
||||
|
||||
@@ -104,6 +104,8 @@
|
||||
<argument>${log.capi.invocation}</argument>
|
||||
<argument>--use-cuda</argument>
|
||||
<argument>${use.cuda}</argument>
|
||||
<argument>--use-hip</argument>
|
||||
<argument>${use.hip}</argument>
|
||||
</arguments>
|
||||
<workingDirectory>${user.dir}</workingDirectory>
|
||||
</configuration>
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Created by bobwang on 2021/9/8.
|
||||
//
|
||||
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
#include <jni.h>
|
||||
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
#include <jni.h>
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#include "../../../../src/common/device_helpers.cuh"
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
#include "../../../../src/common/device_helpers.hip.h"
|
||||
#endif
|
||||
#include "../../../../src/common/cuda_pinned_allocator.h"
|
||||
#include "../../../../src/data/array_interface.h"
|
||||
#include "jvm_utils.h"
|
||||
|
||||
4
jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip
Normal file
4
jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "xgboost4j-gpu.cu"
|
||||
#endif
|
||||
@@ -15,6 +15,10 @@ class BuildConfiguration: # pylint: disable=R0902
|
||||
use_cuda: bool = False
|
||||
# Whether to enable NCCL
|
||||
use_nccl: bool = False
|
||||
# Whether to enablea HIP
|
||||
use_hip: bool = False
|
||||
# Whether to enable RCCL
|
||||
use_rccl: bool = False
|
||||
# Whether to enable HDFS
|
||||
use_hdfs: bool = False
|
||||
# Whether to enable Azure Storage
|
||||
|
||||
1
rocgputreeshap
Submodule
1
rocgputreeshap
Submodule
Submodule rocgputreeshap added at 2fea6734e8
@@ -16,6 +16,11 @@ if (USE_CUDA)
|
||||
target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
|
||||
endif (USE_CUDA)
|
||||
|
||||
if (USE_HIP)
|
||||
file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h)
|
||||
target_sources(objxgboost PRIVATE ${HIP_SOURCES})
|
||||
endif (USE_HIP)
|
||||
|
||||
target_include_directories(objxgboost
|
||||
PRIVATE
|
||||
${xgboost_SOURCE_DIR}/include
|
||||
@@ -33,6 +38,7 @@ msvc_use_static_runtime()
|
||||
|
||||
# This grouping organises source files nicely in visual studio
|
||||
auto_source_group("${CUDA_SOURCES}")
|
||||
auto_source_group("${HIP_SOURCES}")
|
||||
auto_source_group("${CPU_SOURCES}")
|
||||
|
||||
#-- End object library
|
||||
|
||||
@@ -70,12 +70,14 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
|
||||
|
||||
using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
namespace xgboost {
|
||||
void XGBBuildInfoDevice(Json *p_info) {
|
||||
auto &info = *p_info;
|
||||
info["USE_CUDA"] = Boolean{false};
|
||||
info["USE_NCCL"] = Boolean{false};
|
||||
info["USE_HIP"] = Boolean{false};
|
||||
info["USE_RCCL"] = Boolean{false};
|
||||
info["USE_RMM"] = Boolean{false};
|
||||
}
|
||||
} // namespace xgboost
|
||||
@@ -279,7 +281,7 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
|
||||
API_END();
|
||||
}
|
||||
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *, char const *, DMatrixHandle *) {
|
||||
API_BEGIN();
|
||||
common::AssertGPUSupport();
|
||||
@@ -1189,7 +1191,7 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
|
||||
API_END();
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
XGB_DLL int XGBoosterPredictFromCUDAArray(BoosterHandle handle, char const *, char const *,
|
||||
DMatrixHandle, xgboost::bst_ulong const **,
|
||||
xgboost::bst_ulong *, const float **) {
|
||||
|
||||
@@ -17,7 +17,11 @@ namespace xgboost {
|
||||
void XGBBuildInfoDevice(Json *p_info) {
|
||||
auto &info = *p_info;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
info["USE_CUDA"] = true;
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
info["USE_HIP"] = true;
|
||||
#endif
|
||||
|
||||
std::vector<Json> v{Json{Integer{THRUST_MAJOR_VERSION}}, Json{Integer{THRUST_MINOR_VERSION}},
|
||||
Json{Integer{THRUST_SUBMINOR_VERSION}}};
|
||||
@@ -30,8 +34,13 @@ void XGBBuildInfoDevice(Json *p_info) {
|
||||
info["USE_NCCL"] = Boolean{true};
|
||||
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
|
||||
info["NCCL_VERSION"] = v;
|
||||
#elif defined(XGBOOST_USE_RCCL)
|
||||
info["USE_RCCL"] = Boolean{true};
|
||||
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
|
||||
info["RCCL_VERSION"] = v;
|
||||
#else
|
||||
info["USE_NCCL"] = Boolean{false};
|
||||
info["USE_RCCL"] = Boolean{false};
|
||||
#endif
|
||||
|
||||
#if defined(XGBOOST_USE_RMM)
|
||||
|
||||
4
src/c_api/c_api.hip
Normal file
4
src/c_api/c_api.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "c_api.cu"
|
||||
#endif
|
||||
@@ -175,7 +175,7 @@ inline float GetMissing(Json const &config) {
|
||||
|
||||
// Safe guard some global variables from being changed by XGBoost.
|
||||
class XGBoostAPIGuard {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
int32_t device_id_ {0};
|
||||
|
||||
void SetGPUAttribute();
|
||||
|
||||
@@ -51,7 +51,7 @@ void Communicator::Init(Json const& config) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
void Communicator::Finalize() {
|
||||
communicator_->Shutdown();
|
||||
communicator_.reset(new NoOpCommunicator());
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "device_communicator.cuh"
|
||||
#include "device_communicator_adapter.cuh"
|
||||
#include "noop_communicator.h"
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||
#include "nccl_device_communicator.cuh"
|
||||
#endif
|
||||
|
||||
@@ -28,7 +28,7 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
|
||||
communicator_->GetWorldSize() != old_world_size) {
|
||||
old_device_ordinal = device_ordinal;
|
||||
old_world_size = communicator_->GetWorldSize();
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||
switch (type_) {
|
||||
case CommunicatorType::kRabit:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
|
||||
|
||||
@@ -98,7 +98,7 @@ class Communicator {
|
||||
/** @brief Get the communicator instance. */
|
||||
static Communicator *Get() { return communicator_.get(); }
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
/**
|
||||
* @brief Get the device communicator.
|
||||
*
|
||||
@@ -230,7 +230,7 @@ class Communicator {
|
||||
|
||||
static thread_local std::unique_ptr<Communicator> communicator_;
|
||||
static thread_local CommunicatorType type_;
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
|
||||
#endif
|
||||
|
||||
|
||||
4
src/collective/communicator.hip
Normal file
4
src/collective/communicator.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "communicator.cu"
|
||||
#endif
|
||||
@@ -41,7 +41,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size_, 0);
|
||||
segments->at(rank_) = length_bytes;
|
||||
@@ -60,6 +59,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
Broadcast(host_buffer_.data() + offset, as_bytes, i);
|
||||
offset += as_bytes;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
|
||||
cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*!
|
||||
* Copyright 2023 XGBoost contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||
#include "nccl_device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -35,12 +35,22 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
|
||||
|
||||
private:
|
||||
static constexpr std::size_t kUuidLength =
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
sizeof(hipUUID) / sizeof(uint64_t);
|
||||
#endif
|
||||
|
||||
void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cudaDeviceProp prob{};
|
||||
dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
|
||||
std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
hipUUID id;
|
||||
hipDeviceGetUuid(&id, device_ordinal_);
|
||||
std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {
|
||||
|
||||
4
src/collective/nccl_device_communicator.hip
Normal file
4
src/collective/nccl_device_communicator.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "nccl_device_communicator.cu"
|
||||
#endif
|
||||
@@ -10,7 +10,13 @@
|
||||
|
||||
#include <cstddef> // size_t
|
||||
#include <cstdint> // int32_t
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
#include <cub/cub.cuh> // DispatchSegmentedRadixSort,NullType,DoubleBuffer
|
||||
#endif
|
||||
|
||||
#include <iterator> // distance
|
||||
#include <limits> // numeric_limits
|
||||
#include <type_traits> // conditional_t,remove_const_t
|
||||
@@ -39,6 +45,7 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
|
||||
using OffsetT = int;
|
||||
|
||||
// Null value type
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
||||
cub::DoubleBuffer<cub::NullType> d_values;
|
||||
|
||||
@@ -47,6 +54,20 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
|
||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
|
||||
num_segments, d_begin_offsets, d_end_offsets, begin_bit,
|
||||
end_bit, false, ctx->Stream(), debug_synchronous)));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
if (IS_DESCENDING) {
|
||||
rocprim::segmented_radix_sort_pairs_desc<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
|
||||
temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
|
||||
num_segments, d_begin_offsets, d_end_offsets,
|
||||
begin_bit, end_bit, ctx->Stream(), debug_synchronous);
|
||||
}
|
||||
else {
|
||||
rocprim::segmented_radix_sort_pairs<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
|
||||
temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
|
||||
num_segments, d_begin_offsets, d_end_offsets,
|
||||
begin_bit, end_bit, ctx->Stream(), debug_synchronous);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Wrapper around cub sort for easier `descending` sort.
|
||||
@@ -60,14 +81,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
|
||||
BeginOffsetIteratorT d_begin_offsets,
|
||||
EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
|
||||
int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
|
||||
#endif
|
||||
|
||||
// In old version of cub, num_items in dispatch is also int32_t, no way to change.
|
||||
using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
|
||||
std::size_t, std::int32_t>;
|
||||
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
|
||||
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if THRUST_MAJOR_VERSION >= 2
|
||||
dh::safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||
descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
|
||||
@@ -88,6 +113,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
|
||||
d_begin_offsets, d_end_offsets, begin_bit,
|
||||
end_bit, false, stream, false)));
|
||||
#endif
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
if (descending) {
|
||||
rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
||||
d_values_in, d_values_out, num_items, num_segments,
|
||||
d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream, false);
|
||||
}
|
||||
else {
|
||||
rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
||||
d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets,
|
||||
begin_bit, end_bit, stream, false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
|
||||
@@ -17,14 +17,18 @@
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include "device_helpers.cuh"
|
||||
#endif // defined(__CUDACC__)
|
||||
#elif defined(__HIPCC__)
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include "device_helpers.hip.h"
|
||||
#endif // defined(__CUDACC__) || defined(__HIPCC__)
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "common.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
using BitFieldAtomicType = unsigned long long; // NOLINT
|
||||
|
||||
__forceinline__ __device__ BitFieldAtomicType AtomicOr(BitFieldAtomicType* address,
|
||||
@@ -48,7 +52,7 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
|
||||
|
||||
return old;
|
||||
}
|
||||
#endif // defined(__CUDACC__)
|
||||
#endif // defined(__CUDACC__) || defined(__HIPCC__)
|
||||
|
||||
/**
|
||||
* @brief A non-owning type with auxiliary methods defined for manipulating bits.
|
||||
@@ -106,7 +110,7 @@ struct BitFieldContainer {
|
||||
XGBOOST_DEVICE static size_t ComputeStorageSize(index_type size) {
|
||||
return common::DivRoundUp(size, kValueSize);
|
||||
}
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t min_size = min(NumValues(), rhs.NumValues());
|
||||
@@ -123,9 +127,9 @@ struct BitFieldContainer {
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif // #if defined(__CUDA_ARCH__)
|
||||
#endif // #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = min(NumValues(), rhs.NumValues());
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
@@ -144,7 +148,7 @@ struct BitFieldContainer {
|
||||
}
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
__device__ auto Set(index_type pos) noexcept(true) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
@@ -159,6 +163,14 @@ struct BitFieldContainer {
|
||||
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
|
||||
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
|
||||
}
|
||||
#ifdef __HIPCC__
|
||||
void Clear(index_type pos) noexcept(true) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type clear_bit = ~(kOne << pos_v.bit_pos);
|
||||
value &= clear_bit;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
void Set(index_type pos) noexcept(true) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
@@ -172,7 +184,7 @@ struct BitFieldContainer {
|
||||
value_type clear_bit = ~(kOne << pos_v.bit_pos);
|
||||
value &= clear_bit;
|
||||
}
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
#endif // defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
|
||||
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
|
||||
pos_v = Direction::Shift(pos_v);
|
||||
|
||||
@@ -55,7 +55,7 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
int AllVisibleGPUs() { return 0; }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
* Copyright 2018-2022 XGBoost contributors
|
||||
*/
|
||||
#include "common.h"
|
||||
#include "cuda_to_hip.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
@@ -26,6 +26,12 @@
|
||||
|
||||
#define WITH_CUDA() true
|
||||
|
||||
#elif defined(__HIPCC__)
|
||||
#include <thrust/system/hip/error.h>
|
||||
#include <thrust/system_error.h>
|
||||
|
||||
#define WITH_CUDA() true
|
||||
|
||||
#else
|
||||
|
||||
#define WITH_CUDA() false
|
||||
@@ -39,8 +45,8 @@ namespace dh {
|
||||
*/
|
||||
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
|
||||
int line) {
|
||||
inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, int line)
|
||||
{
|
||||
if (code != cudaSuccess) {
|
||||
LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
|
||||
std::string{file} + ": " + // NOLINT
|
||||
@@ -48,7 +54,23 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
|
||||
}
|
||||
return code;
|
||||
}
|
||||
#endif // defined(__CUDACC__)
|
||||
|
||||
#elif defined(__HIPCC__)
|
||||
/*
|
||||
* Error handling functions
|
||||
*/
|
||||
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline hipError_t ThrowOnCudaError(hipError_t code, const char *file, int line)
|
||||
{
|
||||
if (code != hipSuccess) {
|
||||
LOG(FATAL) << thrust::system_error(code, thrust::hip_category(),
|
||||
std::string{file} + ": " + // NOLINT
|
||||
std::to_string(line)).what();
|
||||
}
|
||||
return code;
|
||||
}
|
||||
#endif
|
||||
} // namespace dh
|
||||
|
||||
namespace xgboost::common {
|
||||
@@ -159,9 +181,9 @@ class Range {
|
||||
int AllVisibleGPUs();
|
||||
|
||||
inline void AssertGPUSupport() {
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
LOG(FATAL) << "XGBoost version not compiled with GPU support.";
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
#endif // XGBOOST_USE_CUDA && XGBOOST_USE_HIP
|
||||
}
|
||||
|
||||
inline void AssertOneAPISupport() {
|
||||
@@ -172,7 +194,7 @@ inline void AssertOneAPISupport() {
|
||||
|
||||
void SetDevice(std::int32_t device);
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline void SetDevice(std::int32_t device) {
|
||||
if (device >= 0) {
|
||||
AssertGPUSupport();
|
||||
|
||||
4
src/common/common.hip
Normal file
4
src/common/common.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "common.cu"
|
||||
#endif
|
||||
@@ -11,9 +11,11 @@
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(__CUDACC__)
|
||||
#include "device_helpers.cuh"
|
||||
#endif // __CUDACC__
|
||||
#elif defined(__HIPCC__)
|
||||
#include "device_helpers.hip.h"
|
||||
#endif // __CUDACC__ || __HIPCC__
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -105,7 +107,7 @@ class CompressedBufferWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
__device__ void AtomicWriteSymbol
|
||||
(CompressedByteT* buffer, uint64_t symbol, size_t offset) {
|
||||
size_t ibit_start = offset * symbol_bits_;
|
||||
@@ -119,7 +121,7 @@ class CompressedBufferWriter {
|
||||
symbol >>= 8;
|
||||
}
|
||||
}
|
||||
#endif // __CUDACC__
|
||||
#endif // __CUDACC__ || __HIPCC__
|
||||
|
||||
template <typename IterT>
|
||||
void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
|
||||
#define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
|
||||
#include <thrust/execution_policy.h>
|
||||
|
||||
#include "device_helpers.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -72,11 +72,23 @@ class pinned_allocator {
|
||||
if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if
|
||||
|
||||
pointer result(nullptr);
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
|
||||
#else
|
||||
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT
|
||||
inline void deallocate(pointer p, size_type) {
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipHostFree(p));
|
||||
#else
|
||||
dh::safe_cuda(cudaFreeHost(p));
|
||||
#endif
|
||||
} // NOLINT
|
||||
|
||||
inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // NOLINT
|
||||
|
||||
|
||||
71
src/common/cuda_to_hip.h
Normal file
71
src/common/cuda_to_hip.h
Normal file
@@ -0,0 +1,71 @@
|
||||
/**
|
||||
* Copyright 2017-2023 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
|
||||
#define cudaSuccess hipSuccess
|
||||
#define cudaGetLastError hipGetLastError
|
||||
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaStreamCreate hipStreamCreate
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
#define cudaStreamDestroy hipStreamDestroy
|
||||
#define cudaStreamWaitEvent hipStreamWaitEvent
|
||||
#define cudaStreamSynchronize hipStreamSynchronize
|
||||
#define cudaStreamPerThread hipStreamPerThread
|
||||
#define cudaStreamLegacy hipStreamLegacy
|
||||
|
||||
#define cudaEvent_t hipEvent_t
|
||||
#define cudaEventCreate hipEventCreate
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDestroy hipEventDestroy
|
||||
|
||||
#define cudaGetDevice hipGetDevice
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaGetDeviceCount hipGetDeviceCount
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
|
||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||
#define cudaDeviceGetAttribute hipDeviceGetAttribute
|
||||
|
||||
#define cudaMallocHost hipMallocHost
|
||||
#define cudaFreeHost hipFreeHost
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaFree hipFree
|
||||
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyDefault hipMemcpyDefault
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyHostToHost hipMemcpyHostToHost
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||
#define cudaMemsetAsync hipMemsetAsync
|
||||
#define cudaMemset hipMemset
|
||||
|
||||
#define cudaPointerAttributes hipPointerAttribute_t
|
||||
#define cudaPointerGetAttributes hipPointerGetAttributes
|
||||
|
||||
#define cudaMemGetInfo hipMemGetInfo
|
||||
#define cudaFuncSetAttribute hipFuncSetAttribute
|
||||
|
||||
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
|
||||
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
|
||||
namespace thrust {
|
||||
namespace hip {
|
||||
}
|
||||
}
|
||||
|
||||
namespace thrust {
|
||||
namespace cuda = thrust::hip;
|
||||
}
|
||||
|
||||
namespace hipcub {
|
||||
}
|
||||
|
||||
namespace cub = hipcub;
|
||||
|
||||
#endif
|
||||
@@ -2,6 +2,8 @@
|
||||
* Copyright 2017-2023 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#include <thrust/binary_search.h> // thrust::upper_bound
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
@@ -1218,3 +1220,7 @@ class LDGIterator {
|
||||
}
|
||||
};
|
||||
} // namespace dh
|
||||
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
#include "device_helpers.hip.h"
|
||||
#endif
|
||||
|
||||
1137
src/common/device_helpers.hip.h
Normal file
1137
src/common/device_helpers.hip.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -84,10 +84,19 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
|
||||
template <std::uint32_t kBlockThreads, typename Kernel>
|
||||
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
|
||||
int n_mps = 0;
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
|
||||
#endif
|
||||
int n_blocks_per_mp = 0;
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
|
||||
kBlockThreads, shared_mem));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
|
||||
kBlockThreads, shared_mem));
|
||||
#endif
|
||||
std::uint32_t grid_size = n_blocks_per_mp * n_mps;
|
||||
return grid_size;
|
||||
}
|
||||
@@ -170,6 +179,7 @@ void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per
|
||||
column_sizes_scan->begin(), [=] __device__(size_t column_size) {
|
||||
return thrust::min(num_cuts_per_feature, column_size);
|
||||
});
|
||||
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
|
||||
cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
|
||||
@@ -294,6 +304,7 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
&column_sizes_scan,
|
||||
&sorted_entries);
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
|
||||
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
|
||||
sorted_entries.end(), detail::EntryCompareOp());
|
||||
|
||||
@@ -353,11 +364,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
|
||||
return weights[group_idx];
|
||||
});
|
||||
|
||||
auto retit = thrust::copy_if(thrust::cuda::par(alloc),
|
||||
weight_iter + begin, weight_iter + end,
|
||||
batch_iter + begin,
|
||||
d_temp_weights.data(), // output
|
||||
is_valid);
|
||||
|
||||
CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
|
||||
} else {
|
||||
CHECK_EQ(batch.NumRows(), weights.size());
|
||||
@@ -366,11 +379,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
[=]__device__(size_t idx) -> float {
|
||||
return weights[batch.GetElement(idx).row_idx];
|
||||
});
|
||||
|
||||
auto retit = thrust::copy_if(thrust::cuda::par(alloc),
|
||||
weight_iter + begin, weight_iter + end,
|
||||
batch_iter + begin,
|
||||
d_temp_weights.data(), // output
|
||||
is_valid);
|
||||
|
||||
CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
|
||||
}
|
||||
|
||||
|
||||
4
src/common/hist_util.hip
Normal file
4
src/common/hist_util.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "hist_util.cu"
|
||||
#endif
|
||||
@@ -1,7 +1,7 @@
|
||||
/**
|
||||
* Copyright 2017-2023 by XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
// dummy implementation of HostDeviceVector in case CUDA is not used
|
||||
|
||||
@@ -199,4 +199,4 @@ template class HostDeviceVector<std::size_t>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
#endif // XGBOOST_USE_CUDA && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
@@ -139,6 +139,7 @@ class HostDeviceVectorImpl {
|
||||
auto ptr = other->ConstDevicePointer();
|
||||
SetDevice();
|
||||
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
|
||||
ptr,
|
||||
other->Size() * sizeof(T),
|
||||
@@ -195,6 +196,7 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_ = access;
|
||||
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
|
||||
SetDevice();
|
||||
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(),
|
||||
data_d_->data().get(),
|
||||
data_d_->size() * sizeof(T),
|
||||
@@ -211,6 +213,7 @@ class HostDeviceVectorImpl {
|
||||
// data is on the host
|
||||
LazyResizeDevice(data_h_.size());
|
||||
SetDevice();
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
|
||||
data_h_.data(),
|
||||
data_d_->size() * sizeof(T),
|
||||
@@ -239,6 +242,7 @@ class HostDeviceVectorImpl {
|
||||
LazyResizeDevice(Size());
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
@@ -248,6 +252,7 @@ class HostDeviceVectorImpl {
|
||||
LazyResizeDevice(Size());
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
4
src/common/host_device_vector.hip
Normal file
4
src/common/host_device_vector.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "host_device_vector.cu"
|
||||
#endif
|
||||
@@ -12,7 +12,8 @@
|
||||
namespace xgboost {
|
||||
namespace linalg {
|
||||
template <typename T, int32_t D, typename Fn>
|
||||
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
|
||||
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
|
||||
{
|
||||
dh::safe_cuda(cudaSetDevice(t.DeviceIdx()));
|
||||
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
|
||||
"For function with return, use transform instead.");
|
||||
@@ -28,7 +29,8 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s
|
||||
}
|
||||
|
||||
template <typename T, int32_t D, typename Fn>
|
||||
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
|
||||
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
|
||||
{
|
||||
if (t.Contiguous()) {
|
||||
auto ptr = t.Values().data();
|
||||
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });
|
||||
|
||||
@@ -42,7 +42,7 @@ void ElementWiseKernelHost(linalg::TensorView<T, D> t, int32_t n_threads, Fn&& f
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
template <typename T, int32_t D, typename Fn>
|
||||
void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
|
||||
common::AssertGPUSupport();
|
||||
@@ -60,7 +60,7 @@ void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn)
|
||||
}
|
||||
ElementWiseKernelHost(t, ctx->Threads(), fn);
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
template <typename T, std::int32_t kDim>
|
||||
auto cbegin(TensorView<T, kDim> const& v) { // NOLINT
|
||||
|
||||
@@ -143,7 +143,7 @@ CheckNAN(T) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
|
||||
#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
|
||||
|
||||
bool CheckNAN(double v);
|
||||
|
||||
@@ -152,23 +152,27 @@ bool CheckNAN(double v);
|
||||
XGBOOST_DEVICE bool inline CheckNAN(float x) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return isnan(x);
|
||||
#elif defined(__HIPCC__)
|
||||
return __builtin_isnan(x);
|
||||
#else
|
||||
return std::isnan(x);
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
#endif // defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bool inline CheckNAN(double x) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return isnan(x);
|
||||
#elif defined(__HIPCC__)
|
||||
return __builtin_isnan(x);
|
||||
#else
|
||||
return std::isnan(x);
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
#endif // defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
}
|
||||
|
||||
#endif // XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
|
||||
// GPU version is not uploaded in CRAN anyway.
|
||||
// Specialize only when using R with CPU.
|
||||
#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA)
|
||||
#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
double LogGamma(double v);
|
||||
|
||||
#else // Not R or R with GPU.
|
||||
@@ -191,7 +195,7 @@ XGBOOST_DEVICE inline T LogGamma(T v) {
|
||||
#endif // _MSC_VER
|
||||
}
|
||||
|
||||
#endif // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA)
|
||||
#endif // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -15,6 +15,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
|
||||
values.SetDevice(ctx->gpu_id);
|
||||
auto const d_values = values.ConstDeviceSpan();
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
|
||||
thrust::plus<float>{});
|
||||
}
|
||||
|
||||
@@ -99,12 +99,12 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
|
||||
|
||||
namespace cuda_impl {
|
||||
double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline double Reduce(Context const*, HostDeviceVector<float> const&) {
|
||||
AssertGPUSupport();
|
||||
return 0;
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
} // namespace cuda_impl
|
||||
|
||||
/**
|
||||
|
||||
4
src/common/numeric.hip
Normal file
4
src/common/numeric.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "numeric.cu"
|
||||
#endif
|
||||
@@ -109,6 +109,7 @@ template <typename T, typename U>
|
||||
void CopyTo(Span<T> out, Span<U> src) {
|
||||
CHECK_EQ(out.size(), src.size());
|
||||
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
|
||||
out.size_bytes(),
|
||||
cudaMemcpyDefault));
|
||||
@@ -162,6 +163,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
|
||||
// Compute output ptr
|
||||
auto transform_it =
|
||||
thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
|
||||
|
||||
thrust::transform(
|
||||
thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
|
||||
out_ptr.data(),
|
||||
@@ -211,6 +213,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
|
||||
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
|
||||
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
CHECK_EQ(d_x.size() + d_y.size(), out.size());
|
||||
CHECK_EQ(x_ptr.size(), out_ptr.size());
|
||||
CHECK_EQ(y_ptr.size(), out_ptr.size());
|
||||
@@ -309,6 +312,7 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
common::Span<OffsetT> cuts_ptr,
|
||||
size_t total_cuts, Span<float> weights) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
Span<SketchEntry> out;
|
||||
dh::device_vector<SketchEntry> cuts;
|
||||
bool first_window = this->Current().empty();
|
||||
@@ -378,6 +382,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
|
||||
});
|
||||
// Reverse scan to accumulate weights into first duplicated element on left.
|
||||
auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
|
||||
|
||||
thrust::inclusive_scan_by_key(
|
||||
thrust::cuda::par(alloc), key_it, key_it + entries.size(),
|
||||
val_it, val_it,
|
||||
@@ -443,6 +448,7 @@ void SketchContainer::Prune(size_t to) {
|
||||
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
Span<SketchEntry const> that) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
timer_.Start(__func__);
|
||||
if (this->Current().size() == 0) {
|
||||
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
|
||||
@@ -478,6 +484,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
|
||||
void SketchContainer::FixError() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
|
||||
auto in = dh::ToSpan(this->Current());
|
||||
dh::LaunchN(in.size(), [=] __device__(size_t idx) {
|
||||
@@ -635,10 +642,12 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
|
||||
CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
|
||||
max_values.resize(d_in_columns_ptr.size() - 1);
|
||||
dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
|
||||
|
||||
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
|
||||
thrust::make_discard_iterator(), d_max_values.begin(),
|
||||
thrust::equal_to<bst_feature_t>{},
|
||||
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
|
||||
|
||||
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
|
||||
auto max_it = MakeIndexTransformIter([&](auto i) {
|
||||
if (IsCat(h_feature_types, i)) {
|
||||
|
||||
@@ -175,6 +175,7 @@ class SketchContainer {
|
||||
template <typename KeyComp = thrust::equal_to<size_t>>
|
||||
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
|
||||
timer_.Start(__func__);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
|
||||
@@ -186,11 +187,13 @@ class SketchContainer {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
d_column_scan = this->columns_ptr_.DeviceSpan();
|
||||
|
||||
size_t n_uniques = dh::SegmentedUnique(
|
||||
thrust::cuda::par(alloc), d_column_scan.data(),
|
||||
d_column_scan.data() + d_column_scan.size(), entries.data(),
|
||||
entries.data() + entries.size(), scan_out.DevicePointer(),
|
||||
entries.data(), detail::SketchUnique{}, key_comp);
|
||||
|
||||
this->columns_ptr_.Copy(scan_out);
|
||||
CHECK(!this->columns_ptr_.HostCanRead());
|
||||
|
||||
|
||||
4
src/common/quantile.hip
Normal file
4
src/common/quantile.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "quantile.cu"
|
||||
#endif
|
||||
@@ -62,7 +62,7 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
|
||||
return rank;
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
|
||||
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
|
||||
common::Span<float const>) {
|
||||
@@ -108,9 +108,9 @@ void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
|
||||
});
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
DMLC_REGISTER_PARAMETER(LambdaRankParam);
|
||||
|
||||
@@ -120,7 +120,7 @@ void PreCache::InitOnCPU(Context const*, MetaInfo const& info) {
|
||||
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
@@ -130,9 +130,9 @@ void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
|
||||
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
|
||||
std::string out_name;
|
||||
|
||||
@@ -23,6 +23,10 @@
|
||||
#include "xgboost/logging.h" // for CHECK
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
|
||||
namespace xgboost::ltr {
|
||||
namespace cuda_impl {
|
||||
void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
|
||||
|
||||
4
src/common/ranking_utils.hip
Normal file
4
src/common/ranking_utils.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "ranking_utils.cu"
|
||||
#endif
|
||||
@@ -18,6 +18,7 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace cuda_impl {
|
||||
|
||||
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
|
||||
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
|
||||
CHECK_GE(t.Shape(1), 1);
|
||||
|
||||
@@ -216,6 +216,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
|
||||
detail::SegOp<SegIt>{seg_beg, seg_end});
|
||||
auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
|
||||
detail::WeightOp<WIter>{w_begin, d_sorted_idx});
|
||||
|
||||
thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
|
||||
scan_val, weights_cdf.begin());
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWe
|
||||
|
||||
void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights,
|
||||
linalg::Tensor<float, 1>*) {
|
||||
common::AssertGPUSupport();
|
||||
@@ -120,7 +120,7 @@ inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalW
|
||||
inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
} // namespace cuda_impl
|
||||
|
||||
/**
|
||||
|
||||
4
src/common/stats.hip
Normal file
4
src/common/stats.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "stats.cu"
|
||||
#endif
|
||||
@@ -25,12 +25,12 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::common::ProbabilityDistributionType);
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#if !defined(__CUDACC__) && !defined(__HIPCC__)
|
||||
|
||||
using std::log;
|
||||
using std::fmax;
|
||||
|
||||
#endif // __CUDACC__
|
||||
#endif // __CUDACC__ && __HIPCC__
|
||||
|
||||
enum class CensoringType : uint8_t {
|
||||
kUncensored, kRightCensored, kLeftCensored, kIntervalCensored
|
||||
|
||||
@@ -59,9 +59,11 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
|
||||
});
|
||||
dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
|
||||
out_group_threads_ptr.size());
|
||||
std::size_t total = 0;
|
||||
size_t total = 0;
|
||||
|
||||
dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
||||
sizeof(total), cudaMemcpyDeviceToHost));
|
||||
sizeof(total), cudaMemcpyDeviceToHost));
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
|
||||
@@ -19,7 +19,9 @@
|
||||
|
||||
#if defined (__CUDACC__)
|
||||
#include "device_helpers.cuh"
|
||||
#endif // defined (__CUDACC__)
|
||||
#elif defined(__HIPCC__)
|
||||
#include "device_helpers.hip.h"
|
||||
#endif // defined (__CUDACC__) || defined(__HIPCC__)
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -28,7 +30,7 @@ constexpr size_t kBlockThreads = 256;
|
||||
|
||||
namespace detail {
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
template <typename Functor, typename... SpanType>
|
||||
__global__ void LaunchCUDAKernel(Functor _func, Range _range,
|
||||
SpanType... _spans) {
|
||||
@@ -36,7 +38,7 @@ __global__ void LaunchCUDAKernel(Functor _func, Range _range,
|
||||
_func(i, _spans...);
|
||||
}
|
||||
}
|
||||
#endif // defined(__CUDACC__)
|
||||
#endif // defined(__CUDACC__) || defined(__HIPCC__)
|
||||
|
||||
} // namespace detail
|
||||
|
||||
@@ -127,7 +129,7 @@ class Transform {
|
||||
UnpackShard(device, _vectors...);
|
||||
}
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
|
||||
typename... HDV>
|
||||
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
|
||||
@@ -140,7 +142,9 @@ class Transform {
|
||||
// granularity is used in data vector.
|
||||
size_t shard_size = range_size;
|
||||
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
const int kGrids =
|
||||
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
|
||||
if (kGrids == 0) {
|
||||
@@ -159,7 +163,7 @@ class Transform {
|
||||
|
||||
LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA();
|
||||
}
|
||||
#endif // defined(__CUDACC__)
|
||||
#endif // defined(__CUDACC__) || defined(__HIPCC__)
|
||||
|
||||
template <typename... HDV>
|
||||
void LaunchCPU(Functor func, HDV *...vectors) const {
|
||||
|
||||
@@ -28,7 +28,7 @@ Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
|
||||
namespace {
|
||||
inline constexpr char const* kDevice = "device";
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
|
||||
device = DeviceOrd::CPU();
|
||||
return device;
|
||||
@@ -208,10 +208,10 @@ std::int32_t Context::Threads() const {
|
||||
return n_threads;
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
CUDAContext const* Context::CUDACtx() const {
|
||||
common::AssertGPUSupport();
|
||||
return nullptr;
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
} // namespace xgboost
|
||||
|
||||
4
src/context.hip
Normal file
4
src/context.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "context.cu"
|
||||
#endif
|
||||
@@ -20,7 +20,9 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
|
||||
* case where 0 might be given should either use None, 1, or 2 instead for
|
||||
* clarity.
|
||||
*/
|
||||
#ifndef XGBOOST_USE_HIP
|
||||
LOG(FATAL) << "Invalid stream ID in array interface: " << stream;
|
||||
#endif
|
||||
case 1:
|
||||
// default legacy stream
|
||||
break;
|
||||
@@ -38,6 +40,8 @@ bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
|
||||
if (!ptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cudaPointerAttributes attr;
|
||||
auto err = cudaPointerGetAttributes(&attr, ptr);
|
||||
// reset error
|
||||
@@ -59,5 +63,34 @@ bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
|
||||
// other errors, `cudaErrorNoDevice`, `cudaErrorInsufficientDriver` etc.
|
||||
return false;
|
||||
}
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
hipPointerAttribute_t attr;
|
||||
auto err = hipPointerGetAttributes(&attr, ptr);
|
||||
// reset error
|
||||
CHECK_EQ(err, hipGetLastError());
|
||||
if (err == hipErrorInvalidValue) {
|
||||
return false;
|
||||
} else if (err == hipSuccess) {
|
||||
#if HIP_VERSION_MAJOR < 6
|
||||
switch (attr.memoryType) {
|
||||
case hipMemoryTypeHost:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
switch (attr.type) {
|
||||
case hipMemoryTypeUnregistered:
|
||||
case hipMemoryTypeHost:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#include "cuda_fp16.h"
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
#include <hip/hip_fp16.h>
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
@@ -308,12 +310,12 @@ class ArrayInterfaceHandler {
|
||||
template <typename T, typename E = void>
|
||||
struct ToDType;
|
||||
// float
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
template <>
|
||||
struct ToDType<__half> {
|
||||
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
|
||||
};
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
template <>
|
||||
struct ToDType<float> {
|
||||
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
|
||||
@@ -362,10 +364,10 @@ struct ToDType<int64_t> {
|
||||
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
|
||||
};
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
|
||||
inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
/**
|
||||
* \brief A type erased view over __array_interface__ protocol defined by numpy
|
||||
@@ -463,11 +465,11 @@ class ArrayInterface {
|
||||
CHECK(sizeof(long double) == 16) << error::NoF128();
|
||||
type = T::kF16;
|
||||
} else if (typestr[1] == 'f' && typestr[2] == '2') {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
type = T::kF2;
|
||||
#else
|
||||
LOG(FATAL) << "Half type is not supported.";
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
} else if (typestr[1] == 'f' && typestr[2] == '4') {
|
||||
type = T::kF4;
|
||||
} else if (typestr[1] == 'f' && typestr[2] == '8') {
|
||||
@@ -502,15 +504,15 @@ class ArrayInterface {
|
||||
using T = ArrayInterfaceHandler::Type;
|
||||
switch (type) {
|
||||
case T::kF2: {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
return func(reinterpret_cast<__half const *>(data));
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || || defined(XGBOOST_USE_HIP)
|
||||
}
|
||||
case T::kF4:
|
||||
return func(reinterpret_cast<float const *>(data));
|
||||
case T::kF8:
|
||||
return func(reinterpret_cast<double const *>(data));
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(__CUDA_ARCH__ ) || defined(__HIPCC__)
|
||||
case T::kF16: {
|
||||
// CUDA device code doesn't support long double.
|
||||
SPAN_CHECK(false);
|
||||
@@ -557,7 +559,7 @@ class ArrayInterface {
|
||||
static_assert(sizeof...(index) <= D, "Invalid index.");
|
||||
return this->DispatchCall([=](auto const *p_values) -> T {
|
||||
std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
// No operator defined for half -> size_t
|
||||
using Type = std::conditional_t<
|
||||
std::is_same<__half,
|
||||
@@ -567,7 +569,7 @@ class ArrayInterface {
|
||||
return static_cast<T>(static_cast<Type>(p_values[offset]));
|
||||
#else
|
||||
return static_cast<T>(p_values[offset]);
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
});
|
||||
}
|
||||
|
||||
@@ -604,7 +606,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
|
||||
};
|
||||
switch (array.type) {
|
||||
case ArrayInterfaceHandler::kF2: {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
dispatch(__half{});
|
||||
#endif
|
||||
break;
|
||||
|
||||
4
src/data/array_interface.hip
Normal file
4
src/data/array_interface.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "array_interface.cu"
|
||||
#endif
|
||||
@@ -800,9 +800,9 @@ void MetaInfo::Validate(std::int32_t device) const {
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
void MetaInfo::SetInfoFromCUDA(Context const&, StringView, Json) { common::AssertGPUSupport(); }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
bool MetaInfo::IsVerticalFederated() const {
|
||||
return collective::IsFederated() && IsColumnSplit();
|
||||
|
||||
@@ -44,6 +44,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
|
||||
std::copy(array.shape, array.shape + D, shape.data());
|
||||
// set data
|
||||
data->Resize(array.n);
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
|
||||
cudaMemcpyDefault, ctx->Stream()));
|
||||
});
|
||||
@@ -95,8 +96,10 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
|
||||
}
|
||||
});
|
||||
bool non_dec = true;
|
||||
|
||||
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
|
||||
size_t bytes = 0;
|
||||
dh::caching_device_vector<uint32_t> out(array_interface.Shape(0));
|
||||
@@ -114,8 +117,10 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
|
||||
group_ptr_.clear();
|
||||
group_ptr_.resize(h_num_runs_out + 1, 0);
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
thrust::inclusive_scan(thrust::cuda::par(alloc), cnt.begin(),
|
||||
cnt.begin() + h_num_runs_out, cnt.begin());
|
||||
|
||||
thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out,
|
||||
group_ptr_.begin() + 1);
|
||||
}
|
||||
|
||||
4
src/data/data.hip
Normal file
4
src/data/data.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "data.cu"
|
||||
#endif
|
||||
@@ -122,7 +122,9 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
|
||||
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
|
||||
CHECK_NE(device_idx_, Context::kCpuId);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
|
||||
for (auto& json_col : json_columns) {
|
||||
auto column = ArrayInterface<1>(get<Object const>(json_col));
|
||||
columns.push_back(column);
|
||||
@@ -211,6 +213,7 @@ template <typename AdapterBatchT>
|
||||
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
|
||||
float missing) {
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
IsValidFunctor is_valid(missing);
|
||||
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
|
||||
|
||||
@@ -244,6 +247,7 @@ std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offs
|
||||
&offset[ridx]),
|
||||
static_cast<unsigned long long>(cnt)); // NOLINT
|
||||
});
|
||||
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
bst_row_t row_stride =
|
||||
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
#include "ellpack_page.h"
|
||||
|
||||
@@ -52,5 +52,4 @@ size_t EllpackPage::Size() const {
|
||||
return impl_->Cuts();
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
#endif // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
#include "gradient_index.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include <rocprim/rocprim.hpp>
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {}
|
||||
@@ -102,6 +106,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
row_stride(row_stride),
|
||||
n_rows(n_rows) {
|
||||
monitor_.Init("ellpack_page");
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
@@ -256,6 +261,8 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
|
||||
// Go one level down into cub::DeviceScan API to set OffsetT as 64 bit
|
||||
// So we don't crash on n > 2^31
|
||||
size_t temp_storage_bytes = 0;
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
using DispatchScan =
|
||||
cub::DispatchScan<decltype(key_value_index_iter), decltype(out),
|
||||
TupleScanOp<Tuple>, cub::NullType, int64_t>;
|
||||
@@ -278,6 +285,17 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
|
||||
key_value_index_iter, out, TupleScanOp<Tuple>(),
|
||||
cub::NullType(), batch.Size(), nullptr, false);
|
||||
#endif
|
||||
|
||||
#elif defined (__HIPCC__)
|
||||
|
||||
rocprim::inclusive_scan(nullptr, temp_storage_bytes, key_value_index_iter, out, batch.Size(), TupleScanOp<Tuple>());
|
||||
|
||||
dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
|
||||
|
||||
rocprim::inclusive_scan(temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, batch.Size(),
|
||||
TupleScanOp<Tuple>());
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void WriteNullValues(EllpackPageImpl* dst, int device_idx,
|
||||
@@ -534,11 +552,13 @@ void EllpackPageImpl::CreateHistIndices(int device,
|
||||
// copy data entries to device.
|
||||
if (row_batch.data.DeviceCanRead()) {
|
||||
auto const& d_data = row_batch.data.ConstDeviceSpan();
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
entries_d.data().get(), d_data.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
} else {
|
||||
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
|
||||
4
src/data/ellpack_page.hip
Normal file
4
src/data/ellpack_page.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "ellpack_page.cu"
|
||||
#endif
|
||||
4
src/data/ellpack_page_raw_format.hip
Normal file
4
src/data/ellpack_page_raw_format.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "ellpack_page_raw_format.cu"
|
||||
#endif
|
||||
@@ -45,7 +45,7 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
|
||||
void Fetch() final;
|
||||
};
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline void EllpackPageSource::Fetch() {
|
||||
// silent the warning about unused variables.
|
||||
(void)(row_stride_);
|
||||
|
||||
4
src/data/ellpack_page_source.hip
Normal file
4
src/data/ellpack_page_source.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "ellpack_page_source.cu"
|
||||
#endif
|
||||
@@ -65,12 +65,12 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
|
||||
max_numeric_bins_per_feat(max_bin_per_feat),
|
||||
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
GHistIndexMatrix::GHistIndexMatrix(Context const *, MetaInfo const &, EllpackPage const &,
|
||||
BatchParam const &) {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
|
||||
GHistIndexMatrix::~GHistIndexMatrix() = default;
|
||||
|
||||
|
||||
4
src/data/gradient_index.hip
Normal file
4
src/data/gradient_index.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "gradient_index.cu"
|
||||
#endif
|
||||
@@ -357,7 +357,7 @@ BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(Context const* ctx,
|
||||
return BatchSet<ExtSparsePage>(begin_iter);
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, DataIterHandle, float,
|
||||
std::shared_ptr<DMatrix>) {
|
||||
// silent the warning about unused variables.
|
||||
@@ -377,5 +377,5 @@ inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const*,
|
||||
inline void GetCutsFromEllpack(EllpackPage const&, common::HistogramCuts*) {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -46,7 +46,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
bst_feature_t cols = 0;
|
||||
|
||||
int32_t current_device;
|
||||
|
||||
dh::safe_cuda(cudaGetDevice(¤t_device));
|
||||
|
||||
auto get_device = [&]() -> int32_t {
|
||||
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
|
||||
CHECK_NE(d, Context::kCpuId);
|
||||
@@ -61,7 +63,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
// We use do while here as the first batch is fetched in ctor
|
||||
// ctx_.gpu_id = proxy->DeviceIdx();
|
||||
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
|
||||
if (cols == 0) {
|
||||
cols = num_cols();
|
||||
collective::Allreduce<collective::Operation::kMax>(&cols, 1);
|
||||
@@ -85,7 +89,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
|
||||
return GetRowCounts(value, row_counts_span, get_device(), missing);
|
||||
}));
|
||||
|
||||
nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
|
||||
|
||||
batches++;
|
||||
} while (iter.Next());
|
||||
iter.Reset();
|
||||
@@ -94,6 +100,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
|
||||
if (!ref) {
|
||||
HostDeviceVector<FeatureType> ft;
|
||||
common::SketchContainer final_sketch(
|
||||
@@ -132,7 +139,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
size_t n_batches_for_verification = 0;
|
||||
while (iter.Next()) {
|
||||
init_page();
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
|
||||
auto rows = num_rows();
|
||||
dh::device_vector<size_t> row_counts(rows + 1, 0);
|
||||
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
|
||||
|
||||
4
src/data/iterative_dmatrix.hip
Normal file
4
src/data/iterative_dmatrix.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "iterative_dmatrix.cu"
|
||||
#endif
|
||||
@@ -28,7 +28,7 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
|
||||
namespace cuda_impl {
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy, float missing);
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
|
||||
float) {
|
||||
return nullptr;
|
||||
|
||||
@@ -40,10 +40,10 @@ class DMatrixProxy : public DMatrix {
|
||||
std::any batch_;
|
||||
Context ctx_;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
void FromCudaColumnar(StringView interface_str);
|
||||
void FromCudaArray(StringView interface_str);
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
|
||||
public:
|
||||
int DeviceIdx() const { return ctx_.gpu_id; }
|
||||
@@ -51,7 +51,7 @@ class DMatrixProxy : public DMatrix {
|
||||
void SetCUDAArray(char const* c_interface) {
|
||||
common::AssertGPUSupport();
|
||||
CHECK(c_interface);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
StringView interface_str{c_interface};
|
||||
Json json_array_interface = Json::Load(interface_str);
|
||||
if (IsA<Array>(json_array_interface)) {
|
||||
@@ -59,7 +59,7 @@ class DMatrixProxy : public DMatrix {
|
||||
} else {
|
||||
this->FromCudaArray(interface_str);
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
}
|
||||
|
||||
void SetArrayData(StringView interface_str);
|
||||
|
||||
4
src/data/proxy_dmatrix.hip
Normal file
4
src/data/proxy_dmatrix.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "proxy_dmatrix.cu"
|
||||
#endif
|
||||
@@ -24,6 +24,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
|
||||
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
|
||||
: adapter->DeviceIdx();
|
||||
CHECK_GE(device, 0);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
Context ctx;
|
||||
|
||||
@@ -15,14 +15,28 @@
|
||||
|
||||
namespace xgboost::data {
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
template <typename AdapterBatchT>
|
||||
struct COOToEntryOp {
|
||||
AdapterBatchT batch;
|
||||
|
||||
__device__ Entry operator()(size_t idx) {
|
||||
const auto& e = batch.GetElement(idx);
|
||||
return Entry(e.column_idx, e.value);
|
||||
}
|
||||
};
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
template <typename AdapterBatchT>
|
||||
struct COOToEntryOp : thrust::unary_function<size_t, Entry> {
|
||||
AdapterBatchT batch;
|
||||
COOToEntryOp(AdapterBatchT batch): batch(batch) {};
|
||||
|
||||
__device__ Entry operator()(size_t idx) {
|
||||
const auto& e = batch.GetElement(idx);
|
||||
return Entry(e.column_idx, e.value);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
// Here the data is already correctly ordered and simply needs to be compacted
|
||||
// to remove missing data
|
||||
@@ -42,7 +56,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
|
||||
template <typename AdapterBatchT>
|
||||
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
|
||||
int device_idx, float missing) {
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
IsValidFunctor is_valid(missing);
|
||||
// Count elements per row
|
||||
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
|
||||
@@ -55,6 +71,7 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
|
||||
});
|
||||
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc),
|
||||
thrust::device_pointer_cast(offset.data()),
|
||||
thrust::device_pointer_cast(offset.data() + offset.size()),
|
||||
|
||||
4
src/data/simple_dmatrix.hip
Normal file
4
src/data/simple_dmatrix.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "simple_dmatrix.cu"
|
||||
#endif
|
||||
@@ -19,7 +19,7 @@ const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
|
||||
namespace detail {
|
||||
// Use device dispatch
|
||||
std::size_t NSamplesDevice(DMatrixProxy *) // NOLINT
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
@@ -28,7 +28,7 @@ std::size_t NSamplesDevice(DMatrixProxy *) // NOLINT
|
||||
}
|
||||
#endif
|
||||
std::size_t NFeaturesDevice(DMatrixProxy *) // NOLINT
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
@@ -194,11 +194,11 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ct
|
||||
return BatchSet<GHistIndexMatrix>(BatchIterator<GHistIndexMatrix>(begin_iter));
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const *, const BatchParam &) {
|
||||
common::AssertGPUSupport();
|
||||
auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
|
||||
return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
} // namespace xgboost::data
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
} // namespace data
|
||||
|
||||
4
src/data/sparse_page_dmatrix.hip
Normal file
4
src/data/sparse_page_dmatrix.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "sparse_page_dmatrix.cu"
|
||||
#endif
|
||||
@@ -290,7 +290,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
// Push data from CUDA.
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
|
||||
#else
|
||||
|
||||
4
src/data/sparse_page_source.hip
Normal file
4
src/data/sparse_page_source.hip
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "sparse_page_source.cu"
|
||||
#endif
|
||||
@@ -13,7 +13,7 @@ namespace xgboost {
|
||||
namespace data {
|
||||
struct LabelsCheck {
|
||||
XGBOOST_DEVICE bool operator()(float y) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
|
||||
return ::isnan(y) || ::isinf(y);
|
||||
#else
|
||||
return std::isnan(y) || std::isinf(y);
|
||||
|
||||
@@ -103,7 +103,7 @@ void GBTree::Configure(Args const& cfg) {
|
||||
cpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", this->ctx_));
|
||||
}
|
||||
cpu_predictor_->Configure(cfg);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
auto n_gpus = common::AllVisibleGPUs();
|
||||
if (!gpu_predictor_) {
|
||||
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
|
||||
@@ -111,7 +111,7 @@ void GBTree::Configure(Args const& cfg) {
|
||||
if (n_gpus != 0) {
|
||||
gpu_predictor_->Configure(cfg);
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
|
||||
#if defined(XGBOOST_USE_ONEAPI)
|
||||
if (!oneapi_predictor_) {
|
||||
@@ -150,7 +150,7 @@ void GBTree::Configure(Args const& cfg) {
|
||||
|
||||
void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_group_t,
|
||||
HostDeviceVector<GradientPair>*)
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
@@ -620,7 +620,7 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
|
||||
*/
|
||||
void GPUDartPredictInc(common::Span<float>, common::Span<float>, float, size_t, bst_group_t,
|
||||
bst_group_t)
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
@@ -632,7 +632,7 @@ void GPUDartInplacePredictInc(common::Span<float> /*out_predts*/, common::Span<f
|
||||
float /*tree_w*/, size_t /*n_rows*/,
|
||||
linalg::TensorView<float const, 1> /*base_score*/,
|
||||
bst_group_t /*n_groups*/, bst_group_t /*group*/)
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user