Compare commits

..

182 Commits

Author SHA1 Message Date
Hui Liu
230dc11603 fix memory type 2024-01-26 15:44:24 -08:00
Hui Liu
dc7ee041cc use __HIPCC__ for device code 2024-01-24 12:32:51 -08:00
Hui Liu
7dc152450e workaround memoryType and change rccl config 2024-01-11 13:13:14 -08:00
Hui Liu
82d81bca94 rm hip.h files 2023-10-30 21:54:00 -07:00
Hui Liu
6ec5cf26fc enable 3 more tests 2023-10-30 15:27:02 -07:00
Hui Liu
1ec57fd1a3 enable ROCm support, rm un-necessary code 2023-10-30 12:39:30 -07:00
Hui Liu
d0774a78e4 add hip to config 2023-10-30 12:01:24 -07:00
Hui Liu
8d160a206e add jvm rocm support 2023-10-30 11:49:47 -07:00
Hui Liu
a41bc0975c rocm enable for v2.0.1, rm setup.py 2023-10-27 18:53:16 -07:00
Hui Liu
782b73f2bb rocm enable for v2.0.1 2023-10-27 18:50:28 -07:00
amdsc21
2e7e9d3b2d update rocgputreeshap branch 2023-06-23 19:50:08 +02:00
amdsc21
3e0c7d1dee new url for rocgputreeshap 2023-06-23 19:46:45 +02:00
amdsc21
2f47a1ebe6 rm warp-primitives 2023-06-22 21:43:00 +02:00
amdsc21
5ca7daaa13 merge latest changes 2023-06-15 21:39:14 +02:00
amdsc21
5f78360949 merge changes Jun092023 2023-06-09 22:41:33 +02:00
amdsc21
35cde3b1b2 remove some hip.h 2023-06-07 04:48:09 +02:00
amdsc21
ce345c30a8 remove some hip.h 2023-06-07 03:39:01 +02:00
amdsc21
af8845405a sync Jun 5 2023-06-07 02:43:21 +02:00
amdsc21
9ee1852d4e restore device helper 2023-06-02 02:55:13 +02:00
Your Name
6ecd7903f2 Merge branch 'master' into sync-condition-2023Jun01 2023-06-01 15:58:31 -07:00
Your Name
42867a4805 sync Jun 1 2023-06-01 15:55:06 -07:00
amdsc21
c5b575e00e fix host __assert_fail 2023-05-24 19:40:24 +02:00
amdsc21
1354138b7d Merge branch 'master' into sync-condition-2023May15 2023-05-24 17:44:16 +02:00
amdsc21
b994a38b28 Merge branch 'master' into sync-condition-2023May15 2023-05-23 01:07:50 +02:00
amdsc21
3a834c4992 change workflow 2023-05-20 07:04:06 +02:00
amdsc21
b22644fc10 add hip.h 2023-05-20 01:25:33 +02:00
amdsc21
7663d47383 Merge branch 'master' into sync-condition-2023May15 2023-05-19 20:30:35 +02:00
amdsc21
88fc8badfa Merge branch 'master' into sync-condition-2023May15 2023-05-17 19:55:50 +02:00
amdsc21
8cad8c693c sync up May15 2023 2023-05-15 18:59:18 +02:00
amdsc21
b066accad6 fix lambdarank_obj 2023-05-02 21:06:22 +02:00
amdsc21
b324d51f14 fix array_interface.h half type 2023-05-02 20:50:50 +02:00
amdsc21
65097212b3 fix IterativeDeviceDMatrix, support HIP 2023-05-02 20:20:11 +02:00
amdsc21
4a24ca2f95 fix helpers.h, enable HIP 2023-05-02 20:04:23 +02:00
amdsc21
83e6fceb5c fix lambdarank_obj.cc, support HIP 2023-05-02 19:03:18 +02:00
amdsc21
e4538cb13c fix, to support hip 2023-05-02 17:43:11 +02:00
amdsc21
5446c501af merge 23Mar01 2023-05-02 00:05:58 +02:00
amdsc21
313a74b582 add Shap Magic to check if use cat 2023-05-01 21:55:14 +02:00
amdsc21
65d83e288f fix device query 2023-04-19 19:53:26 +02:00
amdsc21
f645cf51c1 Merge branch 'master' into sync-condition-2023Apr11 2023-04-17 18:33:00 +02:00
amdsc21
db8420225b fix RCCL 2023-04-12 01:09:14 +02:00
amdsc21
843fdde61b sync Apr 11 2023 2023-04-11 20:03:25 +02:00
amdsc21
08bc4b0c0f Merge branch 'master' into sync-condition-2023Apr11 2023-04-11 19:38:38 +02:00
amdsc21
6825d986fd move Dockerfile to ci 2023-04-11 19:34:23 +02:00
paklui
d155ec77f9 building docker for xgboost-amd-condition 2023-03-30 13:36:39 -07:00
amdsc21
991738690f Merge branch 'sync-condition-2023Mar27' into amd-condition 2023-03-30 05:16:36 +02:00
amdsc21
aeb3fd1c95 Merge branch 'master' into sync-condition-2023Mar27 2023-03-30 05:15:55 +02:00
amdsc21
141a062e00 Merge branch 'sync-condition-2023Mar27' into amd-condition 2023-03-30 00:47:16 +02:00
amdsc21
acad01afc9 sync Mar 29 2023-03-30 00:46:50 +02:00
amdsc21
f289e5001d Merge branch 'sync-condition-2023Mar27' into amd-condition 2023-03-28 00:24:12 +02:00
amdsc21
06d9b998ce fix CAPI BuildInfo 2023-03-28 00:14:18 +02:00
amdsc21
c50cc424bc sync Mar 27 2023 2023-03-27 18:54:41 +02:00
amdsc21
8c77e936d1 tune grid size 2023-03-26 17:45:19 +02:00
amdsc21
18034a4291 tune histogram 2023-03-26 01:42:51 +01:00
amdsc21
7ee4734d3a rm device_helpers.hip.h from cu 2023-03-26 00:24:11 +01:00
amdsc21
ee582f03c3 rm device_helpers.hip.h from cuh 2023-03-25 23:35:57 +01:00
amdsc21
f3286bac04 rm warp header 2023-03-25 23:01:44 +01:00
amdsc21
3ee3bea683 fix warp header 2023-03-25 22:37:37 +01:00
amdsc21
5098735698 Merge branch 'condition-sync-Mar24-23' into hui-condition 2023-03-25 05:28:40 +01:00
amdsc21
e74b3bbf3c fix macro 2023-03-25 05:17:39 +01:00
amdsc21
22525c002a fix macro 2023-03-25 05:08:30 +01:00
amdsc21
80961039d7 fix macro 2023-03-25 05:00:55 +01:00
amdsc21
1474789787 add new file 2023-03-25 04:54:02 +01:00
amdsc21
1dc138404a initial merge, fix linalg.h 2023-03-25 04:48:47 +01:00
amdsc21
e1d050f64e initial merge, fix linalg.h 2023-03-25 04:37:43 +01:00
amdsc21
7fbc561e17 initial merge 2023-03-25 04:31:55 +01:00
amdsc21
d97be6f396 enable last 3 tests 2023-03-25 04:05:05 +01:00
amdsc21
f1211cffca enable last 3 tests 2023-03-25 00:45:52 +01:00
amdsc21
e0716afabf fix objective/objective.cc, CMakeFile and setup.py 2023-03-23 20:22:34 +01:00
amdsc21
595cd81251 add max shared mem workaround 2023-03-19 20:08:42 +01:00
amdsc21
0325ce0bed update gputreeshap 2023-03-19 20:07:36 +01:00
amdsc21
a79a35c22c add warp size 2023-03-15 22:00:26 +01:00
amdsc21
4484c7f073 disable Optin Shared Mem 2023-03-15 02:10:16 +01:00
amdsc21
8207015e48 fix ../tests/cpp/common/test_span.h 2023-03-14 22:19:06 +01:00
amdsc21
364df7db0f fix ../tree/gpu_hist/evaluate_splits.hip bugs, size 64 2023-03-14 06:17:21 +01:00
amdsc21
a2bab03205 fix aft_obj.hip 2023-03-13 23:19:59 +01:00
amdsc21
b71c1b50de fix macro, no ! 2023-03-12 23:02:28 +01:00
amdsc21
fa2336fcfd sort bug fix 2023-03-12 07:09:10 +01:00
amdsc21
7d96758382 macro format 2023-03-11 06:57:24 +01:00
amdsc21
b0dacc5a80 fix bug 2023-03-11 03:47:23 +01:00
amdsc21
f64152bf97 add helpers.hip 2023-03-11 02:56:50 +01:00
amdsc21
b4dbe7a649 fix isnan 2023-03-11 02:39:58 +01:00
amdsc21
e5b6219a84 typo 2023-03-11 02:30:27 +01:00
amdsc21
3a07b1edf8 complete test porting 2023-03-11 02:17:05 +01:00
amdsc21
9bf16a2ca6 testing porting 2023-03-11 01:38:54 +01:00
amdsc21
332f6a89a9 more tests 2023-03-11 01:33:48 +01:00
amdsc21
204d0c9a53 add hip tests 2023-03-11 00:38:16 +01:00
amdsc21
e961016e71 rm HIPCUB 2023-03-10 22:21:37 +01:00
amdsc21
f0b8c02f15 merge latest changes 2023-03-10 22:10:20 +01:00
amdsc21
5e8b1842b9 fix Pointer Attr 2023-03-10 19:06:02 +01:00
amdsc21
9f072b50ba fix __popc 2023-03-10 17:14:31 +01:00
amdsc21
e1ddb5ae58 fix macro XGBOOST_USE_HIP 2023-03-10 07:11:05 +01:00
amdsc21
643e2a7b39 fix macro XGBOOST_USE_HIP 2023-03-10 07:09:41 +01:00
amdsc21
bde3107c3e fix macro XGBOOST_USE_HIP 2023-03-10 07:01:25 +01:00
amdsc21
5edfc1e2e9 finish ellpack_page.cc 2023-03-10 06:41:25 +01:00
amdsc21
c073417d0c finish aft_obj.cu 2023-03-10 06:39:03 +01:00
amdsc21
9bbbeb3f03 finish multiclass_obj.cu 2023-03-10 06:35:46 +01:00
amdsc21
4bde2e3412 finish multiclass_obj.cu 2023-03-10 06:35:21 +01:00
amdsc21
58a9fe07b6 finish multiclass_obj.cu 2023-03-10 06:35:06 +01:00
amdsc21
41407850d5 finish rank_obj.cu 2023-03-10 06:29:08 +01:00
amdsc21
968a1db4c0 finish regression_obj.cu 2023-03-10 06:07:53 +01:00
amdsc21
ad710e4888 finish hinge.cu 2023-03-10 06:04:59 +01:00
amdsc21
4e3c699814 finish adaptive.cu 2023-03-10 06:02:48 +01:00
amdsc21
757de84398 finish quantile.cu 2023-03-10 05:55:51 +01:00
amdsc21
d27f9dfdce finish host_device_vector.cu 2023-03-10 05:45:38 +01:00
amdsc21
14cc438a64 finish stats.cu 2023-03-10 05:38:16 +01:00
amdsc21
911a5d8a60 finish hist_util.cu 2023-03-10 05:32:38 +01:00
amdsc21
54b076b40f finish common.cu 2023-03-10 05:20:29 +01:00
amdsc21
91a5ef762e finish common.cu 2023-03-10 05:19:41 +01:00
amdsc21
8fd2af1c8b finish numeric.cu 2023-03-10 05:16:23 +01:00
amdsc21
bb6adda8a3 finish c_api.cu 2023-03-10 05:12:51 +01:00
amdsc21
a76ccff390 finish c_api.cu 2023-03-10 05:11:20 +01:00
amdsc21
61c0b19331 finish ellpack_page_source.cu 2023-03-10 05:06:36 +01:00
amdsc21
fa9f69dd85 finish sparse_page_dmatrix.cu 2023-03-10 05:04:57 +01:00
amdsc21
080fc35c4b finish ellpack_page_raw_format.cu 2023-03-10 05:02:35 +01:00
amdsc21
ccce4cf7e1 finish data.cu 2023-03-10 05:00:57 +01:00
amdsc21
713ab9e1a0 finish sparse_page_source.cu 2023-03-10 04:42:56 +01:00
amdsc21
134cbfddbe finish gradient_index.cu 2023-03-10 04:40:33 +01:00
amdsc21
6e2c5be83e finish array_interface.cu 2023-03-10 04:36:04 +01:00
amdsc21
185dbce21f finish ellpack_page.cu 2023-03-10 04:26:09 +01:00
amdsc21
49732359ef finish iterative_dmatrix.cu 2023-03-10 03:47:00 +01:00
amdsc21
ec9f500a49 finish proxy_dmatrix.cu 2023-03-10 03:40:07 +01:00
amdsc21
53244bef6f finish simple_dmatrix.cu 2023-03-10 03:38:09 +01:00
amdsc21
f0febfbcac finish gpu_predictor.cu 2023-03-10 01:29:54 +01:00
amdsc21
1c58ff61d1 finish fit_stump.cu 2023-03-10 00:46:29 +01:00
amdsc21
1530c03f7d finish constraints.cu 2023-03-09 22:43:51 +01:00
amdsc21
309268de02 finish updater_gpu_hist.cu 2023-03-09 22:40:44 +01:00
amdsc21
500428cc0f finish row_partitioner.cu 2023-03-09 22:31:11 +01:00
amdsc21
495816f694 finished gradient_based_sampler.cu 2023-03-09 22:26:08 +01:00
amdsc21
df42dd2c53 finished evaluator.cu 2023-03-09 22:22:05 +01:00
amdsc21
f55243fda0 finish evaluate_splits.cu 2023-03-09 22:15:10 +01:00
amdsc21
1e09c21456 finished feature_groups.cu 2023-03-09 21:31:00 +01:00
amdsc21
0ed5d3c849 finished histogram.cu 2023-03-09 21:28:37 +01:00
amdsc21
f67e7de7ef finished communicator.cu 2023-03-09 21:02:48 +01:00
amdsc21
5044713388 finished updater_gpu_coordinate.cu 2023-03-09 20:53:54 +01:00
amdsc21
c875f0425f finished rank_metric.cu 2023-03-09 20:48:31 +01:00
amdsc21
4fd08b6c32 finished survival_metric.cu 2023-03-09 20:41:52 +01:00
amdsc21
b9d86d44d6 finish multiclass_metric.cu 2023-03-09 20:37:16 +01:00
amdsc21
a56055225a fix auc.cu 2023-03-09 20:29:38 +01:00
amdsc21
6eba0a56ec fix CMakeLists.txt 2023-03-09 18:57:14 +01:00
amdsc21
00c24a58b1 finish elementwise_metric.cu 2023-03-08 22:50:07 +01:00
amdsc21
6fa248b75f try elementwise_metric.cu 2023-03-08 22:42:48 +01:00
amdsc21
946f9e9802 fix gbtree.cc 2023-03-08 21:44:20 +01:00
amdsc21
4c4e5af29c port elementwise_metric.cu 2023-03-08 21:39:56 +01:00
amdsc21
7e1b06417b finish gbtree.cu porting 2023-03-08 21:09:56 +01:00
amdsc21
cdd7794641 add unused option 2023-03-08 20:37:53 +01:00
amdsc21
cd743a1ae9 fix DispatchRadixSort 2023-03-08 20:31:23 +01:00
amdsc21
a45005863b fix DispatchScan 2023-03-08 20:15:33 +01:00
amdsc21
bdcb036592 add context.hip 2023-03-08 07:34:19 +01:00
amdsc21
7a3a9b682a add device_helpers.hip.h 2023-03-08 07:18:33 +01:00
amdsc21
0a711662c3 add device_helpers.hip.h 2023-03-08 07:10:32 +01:00
amdsc21
312e58ec99 enable rocm, fix common.h 2023-03-08 06:45:03 +01:00
amdsc21
ca8f4e7993 enable rocm, fix stats.cuh 2023-03-08 06:43:06 +01:00
amdsc21
60795f22de enable rocm, fix linalg_op.cuh 2023-03-08 06:42:20 +01:00
amdsc21
05fdca893f enable rocm, fix cuda_pinned_allocator.h 2023-03-08 06:39:40 +01:00
amdsc21
d8cc93f3f2 enable rocm, fix algorithm.cuh 2023-03-08 06:38:35 +01:00
amdsc21
62c4efac51 enable rocm, fix transform.h 2023-03-08 06:37:34 +01:00
amdsc21
ba9e00d911 enable rocm, fix hist_util.cuh 2023-03-08 06:36:15 +01:00
amdsc21
d3be67ad8e enable rocm, fix quantile.cuh 2023-03-08 06:32:09 +01:00
amdsc21
2eb0b6aae4 enable rocm, fix threading_utils.cuh 2023-03-08 06:30:52 +01:00
amdsc21
327f1494f1 enable rocm, fix cuda_context.cuh 2023-03-08 06:29:45 +01:00
amdsc21
fa92aa56ee enable rocm, fix device_adapter.cuh 2023-03-08 06:26:31 +01:00
amdsc21
427f6c2a1a enable rocm, fix simple_dmatrix.cuh 2023-03-08 06:24:34 +01:00
amdsc21
270c7b4802 enable rocm, fix row_partitioner.cuh 2023-03-08 06:22:25 +01:00
amdsc21
0fc1f640a9 enable rocm, fix nccl_device_communicator.cuh 2023-03-08 06:18:13 +01:00
amdsc21
762fd9028d enable rocm, fix device_communicator_adapter.cuh 2023-03-08 06:13:29 +01:00
amdsc21
f2009533e1 rm hip.h 2023-03-08 06:04:01 +01:00
amdsc21
53b5cd73f2 add hip flags 2023-03-08 03:42:51 +01:00
amdsc21
52b05d934e add hip 2023-03-08 03:32:19 +01:00
amdsc21
840f15209c add HIP flags, common 2023-03-08 03:11:49 +01:00
amdsc21
1e1c7fd8d5 add HIP flags, c_api 2023-03-08 01:34:37 +01:00
amdsc21
f5f800c80d add HIP flags 2023-03-08 01:33:38 +01:00
amdsc21
6b7be96373 add HIP flags 2023-03-08 01:22:25 +01:00
amdsc21
75712b9c3c enable HIP flags 2023-03-08 01:10:07 +01:00
amdsc21
ed45aa2816 Merge branch 'master' into dev-hui 2023-03-08 00:39:33 +01:00
amdsc21
f286ae5bfa add hip rocthrust hipcub 2023-03-07 06:35:00 +01:00
amdsc21
f13a7f8d91 add submodules 2023-03-07 05:44:24 +01:00
amdsc21
c51a1c9aae rename hip.cc to hip 2023-03-07 05:39:53 +01:00
amdsc21
30de728631 fix hip.cc 2023-03-07 05:11:42 +01:00
amdsc21
75fa15b36d add hip support 2023-03-07 04:02:49 +01:00
amdsc21
eb30cb6293 add hip support 2023-03-07 03:49:52 +01:00
amdsc21
cafbfce51f add hip.h 2023-03-07 03:46:26 +01:00
amdsc21
6039a71e6c add hip structure 2023-03-07 02:17:19 +01:00
268 changed files with 2597 additions and 547 deletions

3
.gitmodules vendored
View File

@@ -5,3 +5,6 @@
[submodule "gputreeshap"] [submodule "gputreeshap"]
path = gputreeshap path = gputreeshap
url = https://github.com/rapidsai/gputreeshap.git url = https://github.com/rapidsai/gputreeshap.git
[submodule "rocgputreeshap"]
path = rocgputreeshap
url = https://github.com/ROCmSoftwarePlatform/rocgputreeshap

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR) cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 2.0.2) project(xgboost LANGUAGES CXX C VERSION 2.0.1)
include(cmake/Utils.cmake) include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0022 NEW)
@@ -58,7 +58,7 @@ option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC
option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF) option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
option(GOOGLE_TEST "Build google tests" OFF) option(GOOGLE_TEST "Build google tests" OFF)
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF) option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF) option(USE_DEVICE_DEBUG "Generate CUDA/HIP device debug info." OFF)
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF) option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header") set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
option(RABIT_MOCK "Build rabit with mock" OFF) option(RABIT_MOCK "Build rabit with mock" OFF)
@@ -71,6 +71,10 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
set(GPU_COMPUTE_VER "" CACHE STRING set(GPU_COMPUTE_VER "" CACHE STRING
"Semicolon separated list of compute versions to be built against, e.g. '35;61'") "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
## HIP
option(USE_HIP "Build with GPU acceleration" OFF)
option(USE_RCCL "Build with RCCL to enable distributed GPU support." OFF)
option(BUILD_WITH_SHARED_RCCL "Build with shared RCCL library." OFF)
## Copied From dmlc ## Copied From dmlc
option(USE_HDFS "Build with HDFS support" OFF) option(USE_HDFS "Build with HDFS support" OFF)
option(USE_AZURE "Build with AZURE support" OFF) option(USE_AZURE "Build with AZURE support" OFF)
@@ -93,6 +97,7 @@ option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug))) if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.") message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug))) endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
if (USE_NCCL AND NOT (USE_CUDA)) if (USE_NCCL AND NOT (USE_CUDA))
message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.") message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
endif (USE_NCCL AND NOT (USE_CUDA)) endif (USE_NCCL AND NOT (USE_CUDA))
@@ -102,6 +107,17 @@ endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.") message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
if (USE_RCCL AND NOT (USE_HIP))
message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
endif (USE_RCCL AND NOT (USE_HIP))
if (USE_DEVICE_DEBUG AND NOT (USE_HIP))
message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_HIP` flag.")
endif (USE_DEVICE_DEBUG AND NOT (USE_HIP))
if (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
endif (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
if (JVM_BINDINGS AND R_LIB) if (JVM_BINDINGS AND R_LIB)
message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.") message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
endif (JVM_BINDINGS AND R_LIB) endif (JVM_BINDINGS AND R_LIB)
@@ -115,9 +131,15 @@ endif (USE_AVX)
if (PLUGIN_LZ4) if (PLUGIN_LZ4)
message(SEND_ERROR "The option 'PLUGIN_LZ4' is removed from XGBoost.") message(SEND_ERROR "The option 'PLUGIN_LZ4' is removed from XGBoost.")
endif (PLUGIN_LZ4) endif (PLUGIN_LZ4)
if (PLUGIN_RMM AND NOT (USE_CUDA)) if (PLUGIN_RMM AND NOT (USE_CUDA))
message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.") message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
endif (PLUGIN_RMM AND NOT (USE_CUDA)) endif (PLUGIN_RMM AND NOT (USE_CUDA))
if (PLUGIN_RMM AND NOT (USE_HIP))
message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_HIP` flag.")
endif (PLUGIN_RMM AND NOT (USE_HIP))
if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.") message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
@@ -170,6 +192,24 @@ if (USE_CUDA)
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
endif (USE_CUDA) endif (USE_CUDA)
if (USE_HIP)
set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE)
# `export CXX=' is ignored by CMake HIP.
set(CMAKE_HIP_HOST_COMPILER ${CMAKE_CXX_COMPILER})
message(STATUS "Configured HIP host compiler: ${CMAKE_HIP_HOST_COMPILER}")
enable_language(HIP)
find_package(hip REQUIRED)
find_package(rocthrust REQUIRED)
find_package(hipcub REQUIRED)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
endif (USE_HIP)
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))) (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
@@ -209,6 +249,10 @@ if (USE_NCCL)
find_package(Nccl REQUIRED) find_package(Nccl REQUIRED)
endif (USE_NCCL) endif (USE_NCCL)
if (USE_RCCL)
find_package(rccl REQUIRED)
endif (USE_RCCL)
# dmlc-core # dmlc-core
msvc_use_static_runtime() msvc_use_static_runtime()
if (FORCE_SHARED_CRT) if (FORCE_SHARED_CRT)

View File

@@ -1,7 +1,7 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 2.0.2.1 Version: 2.0.1.1
Date: 2023-10-12 Date: 2023-10-12
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),

18
R-package/configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for xgboost 2.0.2. # Generated by GNU Autoconf 2.71 for xgboost 2.0.1.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='xgboost' PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost' PACKAGE_TARNAME='xgboost'
PACKAGE_VERSION='2.0.2' PACKAGE_VERSION='2.0.1'
PACKAGE_STRING='xgboost 2.0.2' PACKAGE_STRING='xgboost 2.0.1'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures xgboost 2.0.2 to adapt to many kinds of systems. \`configure' configures xgboost 2.0.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of xgboost 2.0.2:";; short | recursive ) echo "Configuration of xgboost 2.0.1:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
xgboost configure 2.0.2 xgboost configure 2.0.1
generated by GNU Autoconf 2.71 generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc. Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by xgboost $as_me 2.0.2, which was It was created by xgboost $as_me 2.0.1, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by xgboost $as_me 2.0.2, which was This file was extended by xgboost $as_me 2.0.1, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
xgboost config.status 2.0.2 xgboost config.status 2.0.1
configured by $0, generated by GNU Autoconf 2.71, configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -2,7 +2,7 @@
AC_PREREQ(2.69) AC_PREREQ(2.69)
AC_INIT([xgboost],[2.0.2],[],[xgboost],[]) AC_INIT([xgboost],[2.0.1],[],[xgboost],[])
: ${R_HOME=`R RHOME`} : ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then if test -z "${R_HOME}"; then

View File

@@ -32,7 +32,7 @@ namespace common {
bool CheckNAN(double v) { bool CheckNAN(double v) {
return ISNAN(v); return ISNAN(v);
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
double LogGamma(double v) { double LogGamma(double v) {
return lgammafn(v); return lgammafn(v);
} }

View File

@@ -181,6 +181,23 @@ function(xgboost_set_cuda_flags target)
CUDA_RUNTIME_LIBRARY Static) CUDA_RUNTIME_LIBRARY Static)
endfunction(xgboost_set_cuda_flags) endfunction(xgboost_set_cuda_flags)
# Set HIP related flags to target.
function(xgboost_set_hip_flags target)
if (USE_DEVICE_DEBUG)
target_compile_options(${target} PRIVATE
$<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:HIP>>:-G>)
endif (USE_DEVICE_DEBUG)
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1)
target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)
set_target_properties(${target} PROPERTIES
HIP_STANDARD 17
HIP_STANDARD_REQUIRED ON
HIP_SEPARABLE_COMPILATION OFF)
endfunction(xgboost_set_hip_flags)
macro(xgboost_link_nccl target) macro(xgboost_link_nccl target)
if (BUILD_STATIC_LIB) if (BUILD_STATIC_LIB)
target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR}) target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
@@ -193,6 +210,20 @@ macro(xgboost_link_nccl target)
endif (BUILD_STATIC_LIB) endif (BUILD_STATIC_LIB)
endmacro(xgboost_link_nccl) endmacro(xgboost_link_nccl)
macro(xgboost_link_rccl target)
if(BUILD_STATIC_LIB)
target_include_directories(${target} PUBLIC ${RCCL_INCLUDE_DIR}/rccl)
target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
target_link_libraries(${target} PUBLIC ${RCCL_LIBRARY})
else()
target_include_directories(${target} PRIVATE ${RCCL_INCLUDE_DIR}/rccl)
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
endif()
endmacro()
# compile options # compile options
macro(xgboost_target_properties target) macro(xgboost_target_properties target)
set_target_properties(${target} PROPERTIES set_target_properties(${target} PROPERTIES
@@ -215,6 +246,10 @@ macro(xgboost_target_properties target)
-Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined, -Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
-Wall -Wextra -Wno-expansion-to-defined> -Wall -Wextra -Wno-expansion-to-defined>
) )
target_compile_options(${target} PUBLIC
$<IF:$<COMPILE_LANGUAGE:HIP>,
-Wall -Wextra >
)
endif(ENABLE_ALL_WARNINGS) endif(ENABLE_ALL_WARNINGS)
target_compile_options(${target} target_compile_options(${target}
@@ -283,6 +318,10 @@ macro(xgboost_target_link_libraries target)
target_link_libraries(${target} PUBLIC CUDA::cudart_static) target_link_libraries(${target} PUBLIC CUDA::cudart_static)
endif (USE_CUDA) endif (USE_CUDA)
if (USE_HIP)
xgboost_set_hip_flags(${target})
endif (USE_HIP)
if (PLUGIN_RMM) if (PLUGIN_RMM)
target_link_libraries(${target} PRIVATE rmm::rmm) target_link_libraries(${target} PRIVATE rmm::rmm)
endif (PLUGIN_RMM) endif (PLUGIN_RMM)
@@ -291,6 +330,10 @@ macro(xgboost_target_link_libraries target)
xgboost_link_nccl(${target}) xgboost_link_nccl(${target})
endif (USE_NCCL) endif (USE_NCCL)
if(USE_RCCL)
xgboost_link_rccl(${target})
endif()
if (USE_NVTX) if (USE_NVTX)
target_link_libraries(${target} PRIVATE CUDA::nvToolsExt) target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
endif (USE_NVTX) endif (USE_NVTX)

View File

@@ -3,6 +3,8 @@
set(USE_OPENMP @USE_OPENMP@) set(USE_OPENMP @USE_OPENMP@)
set(USE_CUDA @USE_CUDA@) set(USE_CUDA @USE_CUDA@)
set(USE_NCCL @USE_NCCL@) set(USE_NCCL @USE_NCCL@)
set(USE_HIP @USE_HIP@)
set(USE_RCCL @USE_RCCL@)
set(XGBOOST_BUILD_STATIC_LIB @BUILD_STATIC_LIB@) set(XGBOOST_BUILD_STATIC_LIB @BUILD_STATIC_LIB@)
include(CMakeFindDependencyMacro) include(CMakeFindDependencyMacro)
@@ -15,6 +17,9 @@ if (XGBOOST_BUILD_STATIC_LIB)
if(USE_CUDA) if(USE_CUDA)
find_dependency(CUDA) find_dependency(CUDA)
endif() endif()
if(USE_HIP)
find_dependency(HIP)
endif()
# nccl should be linked statically if xgboost is built as static library. # nccl should be linked statically if xgboost is built as static library.
endif (XGBOOST_BUILD_STATIC_LIB) endif (XGBOOST_BUILD_STATIC_LIB)

View File

@@ -4,13 +4,13 @@ python mapfeat.py
# split train and test # split train and test
python mknfold.py machine.txt 1 python mknfold.py machine.txt 1
# training and output the models # training and output the models
../../xgboost machine.conf ../../../xgboost machine.conf
# output predictions of test data # output predictions of test data
../../xgboost machine.conf task=pred model_in=0002.model ../../../xgboost machine.conf task=pred model_in=0002.model
# print the boosters of 0002.model in dump.raw.txt # print the boosters of 0002.model in dump.raw.txt
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt ../../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
# print the boosters of 0002.model in dump.nice.txt with feature map # print the boosters of 0002.model in dump.nice.txt with feature map
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt ../../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
# cat the result # cat the result
cat dump.nice.txt cat dump.nice.txt

View File

@@ -21,14 +21,12 @@ def normpath(path):
else: else:
return normalized return normalized
def cp(source, target): def cp(source, target):
source = normpath(source) source = normpath(source)
target = normpath(target) target = normpath(target)
print("cp {0} {1}".format(source, target)) print("cp {0} {1}".format(source, target))
shutil.copy(source, target) shutil.copy(source, target)
def maybe_makedirs(path): def maybe_makedirs(path):
path = normpath(path) path = normpath(path)
print("mkdir -p " + path) print("mkdir -p " + path)
@@ -38,7 +36,6 @@ def maybe_makedirs(path):
if e.errno != errno.EEXIST: if e.errno != errno.EEXIST:
raise raise
@contextmanager @contextmanager
def cd(path): def cd(path):
path = normpath(path) path = normpath(path)
@@ -50,22 +47,18 @@ def cd(path):
finally: finally:
os.chdir(cwd) os.chdir(cwd)
def run(command, **kwargs): def run(command, **kwargs):
print(command) print(command)
subprocess.check_call(command, shell=True, **kwargs) subprocess.check_call(command, shell=True, **kwargs)
def get_current_git_tag(): def get_current_git_tag():
out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"]) out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"])
return out.decode().split("\n")[0] return out.decode().split("\n")[0]
def get_current_commit_hash(): def get_current_commit_hash():
out = subprocess.check_output(["git", "rev-parse", "HEAD"]) out = subprocess.check_output(["git", "rev-parse", "HEAD"])
return out.decode().split("\n")[0] return out.decode().split("\n")[0]
def get_current_git_branch(): def get_current_git_branch():
out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"]) out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"])
m = re.search(r"release_[0-9\.]+", out.decode()) m = re.search(r"release_[0-9\.]+", out.decode())
@@ -73,53 +66,38 @@ def get_current_git_branch():
raise ValueError("Expected branch name of form release_xxx") raise ValueError("Expected branch name of form release_xxx")
return m.group(0) return m.group(0)
def retrieve(url, filename=None): def retrieve(url, filename=None):
print(f"{url} -> {filename}") print(f"{url} -> {filename}")
return urlretrieve(url, filename) return urlretrieve(url, filename)
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument("--release-version", type=str, required=True,
"--release-version", help="Version of the release being prepared")
type=str,
required=True,
help="Version of the release being prepared",
)
args = parser.parse_args() args = parser.parse_args()
if sys.platform != "darwin" or platform.machine() != "arm64": if sys.platform != "darwin" or platform.machine() != "x86_64":
raise NotImplementedError("Please run this script using an M1 Mac") raise NotImplementedError("Please run this script using an Intel Mac")
version = args.release_version version = args.release_version
expected_git_tag = "v" + version expected_git_tag = "v" + version
current_git_tag = get_current_git_tag() current_git_tag = get_current_git_tag()
if current_git_tag != expected_git_tag: if current_git_tag != expected_git_tag:
if not current_git_tag: if not current_git_tag:
raise ValueError( raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
f"Expected git tag {expected_git_tag} but current HEAD has no tag. " f"Run: git checkout {expected_git_tag}")
f"Run: git checkout {expected_git_tag}" raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD is at tag "
) f"{current_git_tag}. Run: git checkout {expected_git_tag}")
raise ValueError(
f"Expected git tag {expected_git_tag} but current HEAD is at tag "
f"{current_git_tag}. Run: git checkout {expected_git_tag}"
)
commit_hash = get_current_commit_hash() commit_hash = get_current_commit_hash()
git_branch = get_current_git_branch() git_branch = get_current_git_branch()
print( print(f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}")
f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}"
)
with cd("jvm-packages/"): with cd("jvm-packages/"):
print("====copying pure-Python tracker====") print("====copying pure-Python tracker====")
for use_cuda in [True, False]: for use_cuda in [True, False]:
xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j" xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j"
cp( cp("../python-package/xgboost/tracker.py", f"{xgboost4j}/src/main/resources")
"../python-package/xgboost/tracker.py",
f"{xgboost4j}/src/main/resources",
)
print("====copying resources for testing====") print("====copying resources for testing====")
with cd("../demo/CLI/regression"): with cd("../demo/CLI/regression"):
@@ -137,11 +115,7 @@ def main():
cp(file, f"{xgboost4j_spark}/src/test/resources") cp(file, f"{xgboost4j_spark}/src/test/resources")
print("====Creating directories to hold native binaries====") print("====Creating directories to hold native binaries====")
for os_ident, arch in [ for os_ident, arch in [("linux", "x86_64"), ("windows", "x86_64"), ("macos", "x86_64")]:
("linux", "x86_64"),
("windows", "x86_64"),
("macos", "x86_64"),
]:
output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}" output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
maybe_makedirs(output_dir) maybe_makedirs(output_dir)
for os_ident, arch in [("linux", "x86_64")]: for os_ident, arch in [("linux", "x86_64")]:
@@ -149,86 +123,52 @@ def main():
maybe_makedirs(output_dir) maybe_makedirs(output_dir)
print("====Downloading native binaries from CI====") print("====Downloading native binaries from CI====")
nightly_bucket_prefix = ( nightly_bucket_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
"https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds" maven_repo_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
)
maven_repo_prefix = (
"https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
)
retrieve( retrieve(url=f"{nightly_bucket_prefix}/{git_branch}/xgboost4j_{commit_hash}.dll",
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/xgboost4j_{commit_hash}.dll", filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll")
filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll",
)
retrieve(
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
)
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
# libxgboost4j.so for Linux x86_64, CPU only # libxgboost4j.so for Linux x86_64, CPU only
zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar") zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar")
extract_dir = os.path.join(tempdir, "xgboost4j") extract_dir = os.path.join(tempdir, "xgboost4j")
retrieve( retrieve(url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
f"xgboost4j_2.12-{version}.jar", f"xgboost4j_2.12-{version}.jar",
filename=zip_path, filename=zip_path)
)
os.mkdir(extract_dir) os.mkdir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as t: with zipfile.ZipFile(zip_path, "r") as t:
t.extractall(extract_dir) t.extractall(extract_dir)
cp( cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"), "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
"xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
)
# libxgboost4j.so for Linux x86_64, GPU support # libxgboost4j.so for Linux x86_64, GPU support
zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar") zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar")
extract_dir = os.path.join(tempdir, "xgboost4j-gpu") extract_dir = os.path.join(tempdir, "xgboost4j-gpu")
retrieve( retrieve(url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
f"xgboost4j-gpu_2.12-{version}.jar", f"xgboost4j-gpu_2.12-{version}.jar",
filename=zip_path, filename=zip_path)
)
os.mkdir(extract_dir) os.mkdir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as t: with zipfile.ZipFile(zip_path, "r") as t:
t.extractall(extract_dir) t.extractall(extract_dir)
cp( cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"), "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
"xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
)
print("====Next Steps====") print("====Next Steps====")
print("1. Gain upload right to Maven Central repo.") print("1. Gain upload right to Maven Central repo.")
print("1-1. Sign up for a JIRA account at Sonatype: ") print("1-1. Sign up for a JIRA account at Sonatype: ")
print( print("1-2. File a JIRA ticket: "
"1-2. File a JIRA ticket: "
"https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: " "https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
"https://issues.sonatype.org/browse/OSSRH-67724" "https://issues.sonatype.org/browse/OSSRH-67724")
) print("2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
print( "https://central.sonatype.org/publish/publish-maven/")
"2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in " print("3. Now on a Mac machine, run:")
"https://central.sonatype.org/publish/publish-maven/"
)
print(
"3. Now on a M1 Mac machine, run the following to build Scala 2.12 artifacts:"
)
print(" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests") print(" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests")
print( print("4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
"4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging " "Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 "
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
"to inspect the staged JAR files. Finally, press Release button to publish the " "to inspect the staged JAR files. Finally, press Release button to publish the "
"artifacts to the Maven Central repository. The top-level metapackage should be " "artifacts to the Maven Central repository.")
"named xgboost-jvm_2.12."
)
print("5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:")
print(" rm -rf targets/")
print(" GPG_TTY=$(tty) mvn deploy -Prelease-cpu-only,scala-2.13 -DskipTests")
print(
"6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts."
"The top-level metapackage should be named xgboost-jvm_2.13."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -58,19 +58,19 @@
/*! /*!
* \brief Tag function as usable by device * \brief Tag function as usable by device
*/ */
#if defined (__CUDA__) || defined(__NVCC__) #if defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#define XGBOOST_DEVICE __host__ __device__ #define XGBOOST_DEVICE __host__ __device__
#else #else
#define XGBOOST_DEVICE #define XGBOOST_DEVICE
#endif // defined (__CUDA__) || defined(__NVCC__) #endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#if defined(__CUDA__) || defined(__CUDACC__) #if defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
#define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__ #define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
#define XGBOOST_DEV_INLINE __device__ __forceinline__ #define XGBOOST_DEV_INLINE __device__ __forceinline__
#else #else
#define XGBOOST_HOST_DEV_INLINE #define XGBOOST_HOST_DEV_INLINE
#define XGBOOST_DEV_INLINE #define XGBOOST_DEV_INLINE
#endif // defined(__CUDA__) || defined(__CUDACC__) #endif // defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
// These check are for Makefile. // These check are for Makefile.
#if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT) #if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
@@ -274,8 +274,8 @@ class GradientPairInt64 {
GradientPairInt64(GradientPairInt64 const &g) = default; GradientPairInt64(GradientPairInt64 const &g) = default;
GradientPairInt64 &operator=(GradientPairInt64 const &g) = default; GradientPairInt64 &operator=(GradientPairInt64 const &g) = default;
XGBOOST_DEVICE [[nodiscard]] T GetQuantisedGrad() const { return grad_; } [[nodiscard]] XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; }
XGBOOST_DEVICE [[nodiscard]] T GetQuantisedHess() const { return hess_; } [[nodiscard]] XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; }
XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) { XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) {
grad_ += rhs.grad_; grad_ += rhs.grad_;

View File

@@ -58,11 +58,11 @@
namespace xgboost { namespace xgboost {
#ifdef __CUDACC__ #if defined(__CUDACC__) || defined(__HIPCC__)
// Sets a function to call instead of cudaSetDevice(); // Sets a function to call instead of cudaSetDevice();
// only added for testing // only added for testing
void SetCudaSetDeviceHandler(void (*handler)(int)); void SetCudaSetDeviceHandler(void (*handler)(int));
#endif // __CUDACC__ #endif // __CUDACC__ || __HIPCC__
template <typename T> struct HostDeviceVectorImpl; template <typename T> struct HostDeviceVectorImpl;

View File

@@ -30,11 +30,11 @@
// decouple it from xgboost. // decouple it from xgboost.
#ifndef LINALG_HD #ifndef LINALG_HD
#if defined(__CUDA__) || defined(__NVCC__) #if defined(__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#define LINALG_HD __host__ __device__ #define LINALG_HD __host__ __device__
#else #else
#define LINALG_HD #define LINALG_HD
#endif // defined (__CUDA__) || defined(__NVCC__) #endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#endif // LINALG_HD #endif // LINALG_HD
namespace xgboost::linalg { namespace xgboost::linalg {
@@ -118,7 +118,7 @@ using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value,
template <int32_t n, typename Fn> template <int32_t n, typename Fn>
LINALG_HD constexpr auto UnrollLoop(Fn fn) { LINALG_HD constexpr auto UnrollLoop(Fn fn) {
#if defined __CUDA_ARCH__ #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
#pragma unroll n #pragma unroll n
#endif // defined __CUDA_ARCH__ #endif // defined __CUDA_ARCH__
for (int32_t i = 0; i < n; ++i) { for (int32_t i = 0; i < n; ++i) {
@@ -136,7 +136,7 @@ int32_t NativePopc(T v) {
inline LINALG_HD int Popc(uint32_t v) { inline LINALG_HD int Popc(uint32_t v) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
return __popc(v); return __popc(v);
#elif defined(__GNUC__) || defined(__clang__) #elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
return __builtin_popcount(v); return __builtin_popcount(v);
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
return __popcnt(v); return __popcnt(v);
@@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) {
inline LINALG_HD int Popc(uint64_t v) { inline LINALG_HD int Popc(uint64_t v) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
return __popcll(v); return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__) #elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
return __builtin_popcountll(v); return __builtin_popcountll(v);
#elif defined(_MSC_VER) && defined(_M_X64) #elif defined(_MSC_VER) && defined(_M_X64)
return __popcnt64(v); return __popcnt64(v);

View File

@@ -41,7 +41,9 @@
#if defined(__CUDACC__) #if defined(__CUDACC__)
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif // defined(__CUDACC__) #elif defined(__HIPCC__)
#include <hip/hip_runtime.h>
#endif
/*! /*!
* The version number 1910 is picked up from GSL. * The version number 1910 is picked up from GSL.
@@ -104,7 +106,42 @@ namespace common {
#define SPAN_CHECK KERNEL_CHECK #define SPAN_CHECK KERNEL_CHECK
#else // ------------------------------ not CUDA ---------------------------- #elif defined(__HIPCC__)
// Usual logging facility is not available inside device code.
#if defined(_MSC_VER)
// Windows HIP doesn't have __assert_fail.
#define HIP_KERNEL_CHECK(cond) \
do { \
if (XGBOOST_EXPECT(!(cond), false)) { \
__builtin_trap(); \
} \
} while (0)
#else // defined(_MSC_VER)
#define __ASSERT_STR_HELPER(x) #x
#if 0
#define HIP_KERNEL_CHECK(cond) \
(XGBOOST_EXPECT((cond), true) \
? static_cast<void>(0) \
: __assert_fail(__ASSERT_STR_HELPER((cond)), __FILE__, __LINE__, __PRETTY_FUNCTION__))
#else
#define HIP_KERNEL_CHECK(cond) \
(XGBOOST_EXPECT((cond), true) \
? static_cast<void>(0) \
: __builtin_trap())
#endif
#endif // defined(_MSC_VER)
#define KERNEL_CHECK HIP_KERNEL_CHECK
#define SPAN_CHECK KERNEL_CHECK
#else // ------------------------------ not CUDA or HIP ----------------------------
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 #if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
@@ -120,7 +157,7 @@ namespace common {
#endif // defined(XGBOOST_STRICT_R_MODE) #endif // defined(XGBOOST_STRICT_R_MODE)
#endif // __CUDA_ARCH__ #endif // __CUDA_ARCH__ || __HIPCC__
#define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs)) #define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))
@@ -317,7 +354,7 @@ struct IsSpanOracle<Span<T, Extent>> : std::true_type {};
template <class T> template <class T>
struct IsSpan : public IsSpanOracle<typename std::remove_cv<T>::type> {}; struct IsSpan : public IsSpanOracle<typename std::remove_cv<T>::type> {};
// Re-implement std algorithms here to adopt CUDA. // Re-implement std algorithms here to adopt CUDA/HIP
template <typename T> template <typename T>
struct Less { struct Less {
XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const { XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const {

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 2 /* NOLINT */ #define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */ #define XGBOOST_VER_MINOR 0 /* NOLINT */
#define XGBOOST_VER_PATCH 2 /* NOLINT */ #define XGBOOST_VER_PATCH 1 /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_ #endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -9,6 +9,11 @@ if (USE_CUDA)
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu) ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
endif (USE_CUDA) endif (USE_CUDA)
if (USE_HIP)
list(APPEND JVM_SOURCES
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip)
endif (USE_HIP)
add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES}) add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})
if (ENABLE_ALL_WARNINGS) if (ENABLE_ALL_WARNINGS)

View File

@@ -22,6 +22,8 @@ CONFIG = {
"USE_CUDA": "OFF", "USE_CUDA": "OFF",
"USE_NCCL": "OFF", "USE_NCCL": "OFF",
"USE_HIP": "OFF",
"USE_RCCL": "OFF",
"JVM_BINDINGS": "ON", "JVM_BINDINGS": "ON",
"LOG_CAPI_INVOCATION": "OFF" "LOG_CAPI_INVOCATION": "OFF"
} }
@@ -74,6 +76,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF') parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF') parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
parser.add_argument('--use-hip', type=str, choices=['ON', 'OFF'], default='OFF')
cli_args = parser.parse_args() cli_args = parser.parse_args()
if sys.platform == "darwin": if sys.platform == "darwin":
@@ -84,7 +87,7 @@ if __name__ == "__main__":
print("building Java wrapper") print("building Java wrapper")
with cd(".."): with cd(".."):
build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build' build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip == 'ON' else 'build'
maybe_makedirs(build_dir) maybe_makedirs(build_dir)
with cd(build_dir): with cd(build_dir):
if sys.platform == "win32": if sys.platform == "win32":
@@ -103,6 +106,9 @@ if __name__ == "__main__":
if cli_args.use_cuda == 'ON': if cli_args.use_cuda == 'ON':
CONFIG['USE_CUDA'] = 'ON' CONFIG['USE_CUDA'] = 'ON'
CONFIG['USE_NCCL'] = 'ON' CONFIG['USE_NCCL'] = 'ON'
elif cli_args.use_hip== 'ON':
CONFIG['USE_HIP'] = 'ON'
CONFIG['USE_RCCL'] = 'ON'
args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()] args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
@@ -125,8 +131,8 @@ if __name__ == "__main__":
run(f'"{sys.executable}" mapfeat.py') run(f'"{sys.executable}" mapfeat.py')
run(f'"{sys.executable}" mknfold.py machine.txt 1') run(f'"{sys.executable}" mknfold.py machine.txt 1')
xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j' xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip== 'ON' else 'xgboost4j'
xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark' xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip == 'ON' else 'xgboost4j-spark'
print("copying native library") print("copying native library")
library_name, os_folder = { library_name, os_folder = {

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description> <description>JVM Package for XGBoost</description>
@@ -43,6 +43,7 @@
<maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count> <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
<log.capi.invocation>OFF</log.capi.invocation> <log.capi.invocation>OFF</log.capi.invocation>
<use.cuda>OFF</use.cuda> <use.cuda>OFF</use.cuda>
<use.hip>OFF</use.hip>
<cudf.version>23.08.0</cudf.version> <cudf.version>23.08.0</cudf.version>
<spark.rapids.version>23.08.0</spark.rapids.version> <spark.rapids.version>23.08.0</spark.rapids.version>
<cudf.classifier>cuda11</cudf.classifier> <cudf.classifier>cuda11</cudf.classifier>
@@ -189,93 +190,6 @@
</plugins> </plugins>
</build> </build>
</profile> </profile>
<profile>
<id>release-cpu-only</id>
<modules>
<module>xgboost4j</module>
<module>xgboost4j-example</module>
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>empty-javadoc-jar</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>javadoc</classifier>
<classesDirectory>${basedir}/javadoc</classesDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile> <profile>
<id>assembly</id> <id>assembly</id>
<build> <build>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
</parent> </parent>
<name>xgboost4j-example</name> <name>xgboost4j-example</name>
<artifactId>xgboost4j-example_${scala.binary.version}</artifactId> <artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>
<plugins> <plugins>

View File

@@ -5,13 +5,13 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
</parent> </parent>
<name>xgboost4j-flink</name> <name>xgboost4j-flink</name>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
<properties> <properties>
<flink-ml.version>2.2.0</flink-ml.version> <flink-ml.version>2.2.0</flink-ml.version>
</properties> </properties>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<name>xgboost4j-gpu</name> <name>xgboost4j-gpu</name>
<version>2.0.2</version> <version>2.0.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>
@@ -104,6 +104,8 @@
<argument>${log.capi.invocation}</argument> <argument>${log.capi.invocation}</argument>
<argument>--use-cuda</argument> <argument>--use-cuda</argument>
<argument>${use.cuda}</argument> <argument>${use.cuda}</argument>
<argument>--use-hip</argument>
<argument>${use.hip}</argument>
</arguments> </arguments>
<workingDirectory>${user.dir}</workingDirectory> <workingDirectory>${user.dir}</workingDirectory>
</configuration> </configuration>

View File

@@ -2,7 +2,7 @@
// Created by bobwang on 2021/9/8. // Created by bobwang on 2021/9/8.
// //
#ifndef XGBOOST_USE_CUDA #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
#include <jni.h> #include <jni.h>

View File

@@ -1,6 +1,10 @@
#include <jni.h> #include <jni.h>
#if defined(XGBOOST_USE_CUDA)
#include "../../../../src/common/device_helpers.cuh" #include "../../../../src/common/device_helpers.cuh"
#elif defined(XGBOOST_USE_HIP)
#include "../../../../src/common/device_helpers.hip.h"
#endif
#include "../../../../src/common/cuda_pinned_allocator.h" #include "../../../../src/common/cuda_pinned_allocator.h"
#include "../../../../src/data/array_interface.h" #include "../../../../src/data/array_interface.h"
#include "jvm_utils.h" #include "jvm_utils.h"

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "xgboost4j-gpu.cu"
#endif

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
</parent> </parent>
<name>xgboost4j-spark-gpu</name> <name>xgboost4j-spark-gpu</name>
<artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
</parent> </parent>
<name>xgboost4j-spark</name> <name>xgboost4j-spark</name>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_${scala.binary.version}</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
</parent> </parent>
<name>xgboost4j</name> <name>xgboost4j</name>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>2.0.2</version> <version>2.0.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -15,6 +15,10 @@ class BuildConfiguration: # pylint: disable=R0902
use_cuda: bool = False use_cuda: bool = False
# Whether to enable NCCL # Whether to enable NCCL
use_nccl: bool = False use_nccl: bool = False
# Whether to enablea HIP
use_hip: bool = False
# Whether to enable RCCL
use_rccl: bool = False
# Whether to enable HDFS # Whether to enable HDFS
use_hdfs: bool = False use_hdfs: bool = False
# Whether to enable Azure Storage # Whether to enable Azure Storage

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project] [project]
name = "xgboost" name = "xgboost"
version = "2.0.2" version = "2.0.1"
authors = [ authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" }, { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" } { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
2.0.2 2.0.1

View File

@@ -2093,17 +2093,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
""" """
X, qid = _get_qid(X, None) X, qid = _get_qid(X, None)
# fixme(jiamingy): base margin and group weight is not yet supported. We might Xyq = DMatrix(X, y, qid=qid)
# need to make extra special fields in the dataframe.
Xyq = DMatrix(
X,
y,
qid=qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
nthread=self.n_jobs,
feature_types=self.feature_types,
)
if callable(self.eval_metric): if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs) metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric) result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)

View File

@@ -75,28 +75,3 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
with pytest.raises(ValueError, match="Either `group` or `qid`."): with pytest.raises(ValueError, match="Either `group` or `qid`."):
ranker.fit(df, y, eval_set=[(X, y)]) ranker.fit(df, y, eval_set=[(X, y)])
def run_ranking_categorical(device: str) -> None:
"""Test LTR with categorical features."""
from sklearn.model_selection import cross_val_score
X, y = tm.make_categorical(
n_samples=512, n_features=10, n_categories=3, onehot=False
)
rng = np.random.default_rng(1994)
qid = rng.choice(3, size=y.shape[0])
qid = np.sort(qid)
X["qid"] = qid
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
ltr.fit(X, y)
score = ltr.score(X, y)
assert score > 0.9
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
# test using the score function inside sklearn.
scores = cross_val_score(ltr, X, y)
for s in scores:
assert s > 0.7

1
rocgputreeshap Submodule

Submodule rocgputreeshap added at 2fea6734e8

View File

@@ -16,6 +16,11 @@ if (USE_CUDA)
target_sources(objxgboost PRIVATE ${CUDA_SOURCES}) target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
endif (USE_CUDA) endif (USE_CUDA)
if (USE_HIP)
file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h)
target_sources(objxgboost PRIVATE ${HIP_SOURCES})
endif (USE_HIP)
target_include_directories(objxgboost target_include_directories(objxgboost
PRIVATE PRIVATE
${xgboost_SOURCE_DIR}/include ${xgboost_SOURCE_DIR}/include
@@ -33,6 +38,7 @@ msvc_use_static_runtime()
# This grouping organises source files nicely in visual studio # This grouping organises source files nicely in visual studio
auto_source_group("${CUDA_SOURCES}") auto_source_group("${CUDA_SOURCES}")
auto_source_group("${HIP_SOURCES}")
auto_source_group("${CPU_SOURCES}") auto_source_group("${CPU_SOURCES}")
#-- End object library #-- End object library

View File

@@ -70,12 +70,14 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>; using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
namespace xgboost { namespace xgboost {
void XGBBuildInfoDevice(Json *p_info) { void XGBBuildInfoDevice(Json *p_info) {
auto &info = *p_info; auto &info = *p_info;
info["USE_CUDA"] = Boolean{false}; info["USE_CUDA"] = Boolean{false};
info["USE_NCCL"] = Boolean{false}; info["USE_NCCL"] = Boolean{false};
info["USE_HIP"] = Boolean{false};
info["USE_RCCL"] = Boolean{false};
info["USE_RMM"] = Boolean{false}; info["USE_RMM"] = Boolean{false};
} }
} // namespace xgboost } // namespace xgboost
@@ -279,7 +281,7 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
API_END(); API_END();
} }
#ifndef XGBOOST_USE_CUDA #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *, char const *, DMatrixHandle *) { XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *, char const *, DMatrixHandle *) {
API_BEGIN(); API_BEGIN();
common::AssertGPUSupport(); common::AssertGPUSupport();
@@ -1189,7 +1191,7 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
API_END(); API_END();
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
XGB_DLL int XGBoosterPredictFromCUDAArray(BoosterHandle handle, char const *, char const *, XGB_DLL int XGBoosterPredictFromCUDAArray(BoosterHandle handle, char const *, char const *,
DMatrixHandle, xgboost::bst_ulong const **, DMatrixHandle, xgboost::bst_ulong const **,
xgboost::bst_ulong *, const float **) { xgboost::bst_ulong *, const float **) {

View File

@@ -17,7 +17,11 @@ namespace xgboost {
void XGBBuildInfoDevice(Json *p_info) { void XGBBuildInfoDevice(Json *p_info) {
auto &info = *p_info; auto &info = *p_info;
#if defined(XGBOOST_USE_CUDA)
info["USE_CUDA"] = true; info["USE_CUDA"] = true;
#elif defined(XGBOOST_USE_HIP)
info["USE_HIP"] = true;
#endif
std::vector<Json> v{Json{Integer{THRUST_MAJOR_VERSION}}, Json{Integer{THRUST_MINOR_VERSION}}, std::vector<Json> v{Json{Integer{THRUST_MAJOR_VERSION}}, Json{Integer{THRUST_MINOR_VERSION}},
Json{Integer{THRUST_SUBMINOR_VERSION}}}; Json{Integer{THRUST_SUBMINOR_VERSION}}};
@@ -30,8 +34,13 @@ void XGBBuildInfoDevice(Json *p_info) {
info["USE_NCCL"] = Boolean{true}; info["USE_NCCL"] = Boolean{true};
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}}; v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
info["NCCL_VERSION"] = v; info["NCCL_VERSION"] = v;
#elif defined(XGBOOST_USE_RCCL)
info["USE_RCCL"] = Boolean{true};
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
info["RCCL_VERSION"] = v;
#else #else
info["USE_NCCL"] = Boolean{false}; info["USE_NCCL"] = Boolean{false};
info["USE_RCCL"] = Boolean{false};
#endif #endif
#if defined(XGBOOST_USE_RMM) #if defined(XGBOOST_USE_RMM)

4
src/c_api/c_api.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "c_api.cu"
#endif

View File

@@ -175,7 +175,7 @@ inline float GetMissing(Json const &config) {
// Safe guard some global variables from being changed by XGBoost. // Safe guard some global variables from being changed by XGBoost.
class XGBoostAPIGuard { class XGBoostAPIGuard {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
int32_t device_id_ {0}; int32_t device_id_ {0};
void SetGPUAttribute(); void SetGPUAttribute();

View File

@@ -51,7 +51,7 @@ void Communicator::Init(Json const& config) {
} }
} }
#ifndef XGBOOST_USE_CUDA #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void Communicator::Finalize() { void Communicator::Finalize() {
communicator_->Shutdown(); communicator_->Shutdown();
communicator_.reset(new NoOpCommunicator()); communicator_.reset(new NoOpCommunicator());

View File

@@ -5,7 +5,7 @@
#include "device_communicator.cuh" #include "device_communicator.cuh"
#include "device_communicator_adapter.cuh" #include "device_communicator_adapter.cuh"
#include "noop_communicator.h" #include "noop_communicator.h"
#ifdef XGBOOST_USE_NCCL #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include "nccl_device_communicator.cuh" #include "nccl_device_communicator.cuh"
#endif #endif
@@ -28,7 +28,7 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
communicator_->GetWorldSize() != old_world_size) { communicator_->GetWorldSize() != old_world_size) {
old_device_ordinal = device_ordinal; old_device_ordinal = device_ordinal;
old_world_size = communicator_->GetWorldSize(); old_world_size = communicator_->GetWorldSize();
#ifdef XGBOOST_USE_NCCL #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
switch (type_) { switch (type_) {
case CommunicatorType::kRabit: case CommunicatorType::kRabit:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));

View File

@@ -98,7 +98,7 @@ class Communicator {
/** @brief Get the communicator instance. */ /** @brief Get the communicator instance. */
static Communicator *Get() { return communicator_.get(); } static Communicator *Get() { return communicator_.get(); }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
/** /**
* @brief Get the device communicator. * @brief Get the device communicator.
* *
@@ -230,7 +230,7 @@ class Communicator {
static thread_local std::unique_ptr<Communicator> communicator_; static thread_local std::unique_ptr<Communicator> communicator_;
static thread_local CommunicatorType type_; static thread_local CommunicatorType type_;
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_; static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
#endif #endif

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "communicator.cu"
#endif

View File

@@ -41,7 +41,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
} }
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
segments->clear(); segments->clear();
segments->resize(world_size_, 0); segments->resize(world_size_, 0);
segments->at(rank_) = length_bytes; segments->at(rank_) = length_bytes;
@@ -60,6 +59,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
Broadcast(host_buffer_.data() + offset, as_bytes, i); Broadcast(host_buffer_.data() + offset, as_bytes, i);
offset += as_bytes; offset += as_bytes;
} }
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes, dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
cudaMemcpyDefault)); cudaMemcpyDefault));
} }

View File

@@ -1,7 +1,7 @@
/*! /*!
* Copyright 2023 XGBoost contributors * Copyright 2023 XGBoost contributors
*/ */
#if defined(XGBOOST_USE_NCCL) #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include "nccl_device_communicator.cuh" #include "nccl_device_communicator.cuh"
namespace xgboost { namespace xgboost {

View File

@@ -35,12 +35,22 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
private: private:
static constexpr std::size_t kUuidLength = static constexpr std::size_t kUuidLength =
#if defined(XGBOOST_USE_CUDA)
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t); sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
#elif defined(XGBOOST_USE_HIP)
sizeof(hipUUID) / sizeof(uint64_t);
#endif
void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const { void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
#if defined(XGBOOST_USE_CUDA)
cudaDeviceProp prob{}; cudaDeviceProp prob{};
dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_)); dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid)); std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
#elif defined(XGBOOST_USE_HIP)
hipUUID id;
hipDeviceGetUuid(&id, device_ordinal_);
std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
#endif
} }
static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) { static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "nccl_device_communicator.cu"
#endif

View File

@@ -10,7 +10,13 @@
#include <cstddef> // size_t #include <cstddef> // size_t
#include <cstdint> // int32_t #include <cstdint> // int32_t
#if defined(XGBOOST_USE_HIP)
#include <hipcub/hipcub.hpp>
#elif defined(XGBOOST_USE_CUDA)
#include <cub/cub.cuh> // DispatchSegmentedRadixSort,NullType,DoubleBuffer #include <cub/cub.cuh> // DispatchSegmentedRadixSort,NullType,DoubleBuffer
#endif
#include <iterator> // distance #include <iterator> // distance
#include <limits> // numeric_limits #include <limits> // numeric_limits
#include <type_traits> // conditional_t,remove_const_t #include <type_traits> // conditional_t,remove_const_t
@@ -39,6 +45,7 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
using OffsetT = int; using OffsetT = int;
// Null value type // Null value type
#if defined(XGBOOST_USE_CUDA)
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
cub::DoubleBuffer<cub::NullType> d_values; cub::DoubleBuffer<cub::NullType> d_values;
@@ -47,6 +54,20 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
num_segments, d_begin_offsets, d_end_offsets, begin_bit, num_segments, d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, ctx->Stream(), debug_synchronous))); end_bit, false, ctx->Stream(), debug_synchronous)));
#elif defined(XGBOOST_USE_HIP)
if (IS_DESCENDING) {
rocprim::segmented_radix_sort_pairs_desc<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit, ctx->Stream(), debug_synchronous);
}
else {
rocprim::segmented_radix_sort_pairs<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit, ctx->Stream(), debug_synchronous);
}
#endif
} }
// Wrapper around cub sort for easier `descending` sort. // Wrapper around cub sort for easier `descending` sort.
@@ -60,14 +81,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
BeginOffsetIteratorT d_begin_offsets, BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream, EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
#if defined(XGBOOST_USE_CUDA)
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out); cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
#endif
// In old version of cub, num_items in dispatch is also int32_t, no way to change. // In old version of cub, num_items in dispatch is also int32_t, no way to change.
using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(), using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
std::size_t, std::int32_t>; std::size_t, std::int32_t>;
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max()); CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
#if defined(XGBOOST_USE_CUDA)
#if THRUST_MAJOR_VERSION >= 2 #if THRUST_MAJOR_VERSION >= 2
dh::safe_cuda((cub::DispatchSegmentedRadixSort< dh::safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
@@ -88,6 +113,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
d_begin_offsets, d_end_offsets, begin_bit, d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, stream, false))); end_bit, false, stream, false)));
#endif #endif
#elif defined(XGBOOST_USE_HIP)
if (descending) {
rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
d_values_in, d_values_out, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream, false);
}
else {
rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit, stream, false);
}
#endif
} }
} // namespace detail } // namespace detail

View File

@@ -17,14 +17,18 @@
#include <thrust/copy.h> #include <thrust/copy.h>
#include <thrust/device_ptr.h> #include <thrust/device_ptr.h>
#include "device_helpers.cuh" #include "device_helpers.cuh"
#endif // defined(__CUDACC__) #elif defined(__HIPCC__)
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include "device_helpers.hip.h"
#endif // defined(__CUDACC__) || defined(__HIPCC__)
#include "xgboost/span.h" #include "xgboost/span.h"
#include "common.h" #include "common.h"
namespace xgboost { namespace xgboost {
#if defined(__CUDACC__) #if defined(__CUDACC__) || defined(__HIPCC__)
using BitFieldAtomicType = unsigned long long; // NOLINT using BitFieldAtomicType = unsigned long long; // NOLINT
__forceinline__ __device__ BitFieldAtomicType AtomicOr(BitFieldAtomicType* address, __forceinline__ __device__ BitFieldAtomicType AtomicOr(BitFieldAtomicType* address,
@@ -48,7 +52,7 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
return old; return old;
} }
#endif // defined(__CUDACC__) #endif // defined(__CUDACC__) || defined(__HIPCC__)
/** /**
* @brief A non-owning type with auxiliary methods defined for manipulating bits. * @brief A non-owning type with auxiliary methods defined for manipulating bits.
@@ -106,7 +110,7 @@ struct BitFieldContainer {
XGBOOST_DEVICE static size_t ComputeStorageSize(index_type size) { XGBOOST_DEVICE static size_t ComputeStorageSize(index_type size) {
return common::DivRoundUp(size, kValueSize); return common::DivRoundUp(size, kValueSize);
} }
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) { __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t min_size = min(NumValues(), rhs.NumValues()); size_t min_size = min(NumValues(), rhs.NumValues());
@@ -123,9 +127,9 @@ struct BitFieldContainer {
} }
return *this; return *this;
} }
#endif // #if defined(__CUDA_ARCH__) #endif // #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) { __device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = min(NumValues(), rhs.NumValues()); size_t min_size = min(NumValues(), rhs.NumValues());
auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -144,7 +148,7 @@ struct BitFieldContainer {
} }
#endif // defined(__CUDA_ARCH__) #endif // defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
__device__ auto Set(index_type pos) noexcept(true) { __device__ auto Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos)); Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = Data()[pos_v.int_pos]; value_type& value = Data()[pos_v.int_pos];
@@ -159,6 +163,14 @@ struct BitFieldContainer {
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type; using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit); atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
} }
#ifdef __HIPCC__
void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
value &= clear_bit;
}
#endif
#else #else
void Set(index_type pos) noexcept(true) { void Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos)); Pos pos_v = Direction::Shift(ToBitPos(pos));
@@ -172,7 +184,7 @@ struct BitFieldContainer {
value_type clear_bit = ~(kOne << pos_v.bit_pos); value_type clear_bit = ~(kOne << pos_v.bit_pos);
value &= clear_bit; value &= clear_bit;
} }
#endif // defined(__CUDA_ARCH__) #endif // defined(__CUDA_ARCH__) || defined(__HIPCC__)
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) { XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
pos_v = Direction::Shift(pos_v); pos_v = Direction::Shift(pos_v);

View File

@@ -55,7 +55,7 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
} }
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
int AllVisibleGPUs() { return 0; } int AllVisibleGPUs() { return 0; }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA)

View File

@@ -2,6 +2,7 @@
* Copyright 2018-2022 XGBoost contributors * Copyright 2018-2022 XGBoost contributors
*/ */
#include "common.h" #include "common.h"
#include "cuda_to_hip.h"
namespace xgboost { namespace xgboost {
namespace common { namespace common {

View File

@@ -26,6 +26,12 @@
#define WITH_CUDA() true #define WITH_CUDA() true
#elif defined(__HIPCC__)
#include <thrust/system/hip/error.h>
#include <thrust/system_error.h>
#define WITH_CUDA() true
#else #else
#define WITH_CUDA() false #define WITH_CUDA() false
@@ -39,8 +45,8 @@ namespace dh {
*/ */
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__) #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, int line)
int line) { {
if (code != cudaSuccess) { if (code != cudaSuccess) {
LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
std::string{file} + ": " + // NOLINT std::string{file} + ": " + // NOLINT
@@ -48,7 +54,23 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
} }
return code; return code;
} }
#endif // defined(__CUDACC__)
#elif defined(__HIPCC__)
/*
* Error handling functions
*/
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
inline hipError_t ThrowOnCudaError(hipError_t code, const char *file, int line)
{
if (code != hipSuccess) {
LOG(FATAL) << thrust::system_error(code, thrust::hip_category(),
std::string{file} + ": " + // NOLINT
std::to_string(line)).what();
}
return code;
}
#endif
} // namespace dh } // namespace dh
namespace xgboost::common { namespace xgboost::common {
@@ -159,9 +181,9 @@ class Range {
int AllVisibleGPUs(); int AllVisibleGPUs();
inline void AssertGPUSupport() { inline void AssertGPUSupport() {
#ifndef XGBOOST_USE_CUDA #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
LOG(FATAL) << "XGBoost version not compiled with GPU support."; LOG(FATAL) << "XGBoost version not compiled with GPU support.";
#endif // XGBOOST_USE_CUDA #endif // XGBOOST_USE_CUDA && XGBOOST_USE_HIP
} }
inline void AssertOneAPISupport() { inline void AssertOneAPISupport() {
@@ -172,7 +194,7 @@ inline void AssertOneAPISupport() {
void SetDevice(std::int32_t device); void SetDevice(std::int32_t device);
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void SetDevice(std::int32_t device) { inline void SetDevice(std::int32_t device) {
if (device >= 0) { if (device >= 0) {
AssertGPUSupport(); AssertGPUSupport();

4
src/common/common.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "common.cu"
#endif

View File

@@ -11,9 +11,11 @@
#include "common.h" #include "common.h"
#ifdef __CUDACC__ #if defined(__CUDACC__)
#include "device_helpers.cuh" #include "device_helpers.cuh"
#endif // __CUDACC__ #elif defined(__HIPCC__)
#include "device_helpers.hip.h"
#endif // __CUDACC__ || __HIPCC__
namespace xgboost { namespace xgboost {
namespace common { namespace common {
@@ -105,7 +107,7 @@ class CompressedBufferWriter {
} }
} }
#ifdef __CUDACC__ #if defined(__CUDACC__) || defined(__HIPCC__)
__device__ void AtomicWriteSymbol __device__ void AtomicWriteSymbol
(CompressedByteT* buffer, uint64_t symbol, size_t offset) { (CompressedByteT* buffer, uint64_t symbol, size_t offset) {
size_t ibit_start = offset * symbol_bits_; size_t ibit_start = offset * symbol_bits_;
@@ -119,7 +121,7 @@ class CompressedBufferWriter {
symbol >>= 8; symbol >>= 8;
} }
} }
#endif // __CUDACC__ #endif // __CUDACC__ || __HIPCC__
template <typename IterT> template <typename IterT>
void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) { void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {

View File

@@ -4,7 +4,6 @@
#ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_ #ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
#define XGBOOST_COMMON_CUDA_CONTEXT_CUH_ #define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include "device_helpers.cuh" #include "device_helpers.cuh"
namespace xgboost { namespace xgboost {

View File

@@ -72,11 +72,23 @@ class pinned_allocator {
if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if
pointer result(nullptr); pointer result(nullptr);
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
#else
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type))); dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
#endif
return result; return result;
} }
inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT inline void deallocate(pointer p, size_type) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipHostFree(p));
#else
dh::safe_cuda(cudaFreeHost(p));
#endif
} // NOLINT
inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // NOLINT inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // NOLINT

71
src/common/cuda_to_hip.h Normal file
View File

@@ -0,0 +1,71 @@
/**
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once
#if defined(XGBOOST_USE_HIP)
#define cudaSuccess hipSuccess
#define cudaGetLastError hipGetLastError
#define cudaStream_t hipStream_t
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamLegacy hipStreamLegacy
#define cudaEvent_t hipEvent_t
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaMallocHost hipMallocHost
#define cudaFreeHost hipFreeHost
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyHostToHost hipMemcpyHostToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaMemGetInfo hipMemGetInfo
#define cudaFuncSetAttribute hipFuncSetAttribute
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
namespace thrust {
namespace hip {
}
}
namespace thrust {
namespace cuda = thrust::hip;
}
namespace hipcub {
}
namespace cub = hipcub;
#endif

View File

@@ -2,6 +2,8 @@
* Copyright 2017-2023 XGBoost contributors * Copyright 2017-2023 XGBoost contributors
*/ */
#pragma once #pragma once
#if defined(XGBOOST_USE_CUDA)
#include <thrust/binary_search.h> // thrust::upper_bound #include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h> #include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h> #include <thrust/device_ptr.h>
@@ -1218,3 +1220,7 @@ class LDGIterator {
} }
}; };
} // namespace dh } // namespace dh
#elif defined(XGBOOST_USE_HIP)
#include "device_helpers.hip.h"
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -84,10 +84,19 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
template <std::uint32_t kBlockThreads, typename Kernel> template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0; int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
#endif
int n_blocks_per_mp = 0; int n_blocks_per_mp = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem)); kBlockThreads, shared_mem));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#endif
std::uint32_t grid_size = n_blocks_per_mp * n_mps; std::uint32_t grid_size = n_blocks_per_mp * n_mps;
return grid_size; return grid_size;
} }
@@ -170,6 +179,7 @@ void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per
column_sizes_scan->begin(), [=] __device__(size_t column_size) { column_sizes_scan->begin(), [=] __device__(size_t column_size) {
return thrust::min(num_cuts_per_feature, column_size); return thrust::min(num_cuts_per_feature, column_size);
}); });
thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it, thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer()); cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(), thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
@@ -294,6 +304,7 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
&column_sizes_scan, &column_sizes_scan,
&sorted_entries); &sorted_entries);
dh::XGBDeviceAllocator<char> alloc; dh::XGBDeviceAllocator<char> alloc;
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(), thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp()); sorted_entries.end(), detail::EntryCompareOp());
@@ -353,11 +364,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx); bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
return weights[group_idx]; return weights[group_idx];
}); });
auto retit = thrust::copy_if(thrust::cuda::par(alloc), auto retit = thrust::copy_if(thrust::cuda::par(alloc),
weight_iter + begin, weight_iter + end, weight_iter + begin, weight_iter + end,
batch_iter + begin, batch_iter + begin,
d_temp_weights.data(), // output d_temp_weights.data(), // output
is_valid); is_valid);
CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size()); CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
} else { } else {
CHECK_EQ(batch.NumRows(), weights.size()); CHECK_EQ(batch.NumRows(), weights.size());
@@ -366,11 +379,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
[=]__device__(size_t idx) -> float { [=]__device__(size_t idx) -> float {
return weights[batch.GetElement(idx).row_idx]; return weights[batch.GetElement(idx).row_idx];
}); });
auto retit = thrust::copy_if(thrust::cuda::par(alloc), auto retit = thrust::copy_if(thrust::cuda::par(alloc),
weight_iter + begin, weight_iter + end, weight_iter + begin, weight_iter + end,
batch_iter + begin, batch_iter + begin,
d_temp_weights.data(), // output d_temp_weights.data(), // output
is_valid); is_valid);
CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size()); CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
} }

4
src/common/hist_util.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "hist_util.cu"
#endif

View File

@@ -1,7 +1,7 @@
/** /**
* Copyright 2017-2023 by XGBoost contributors * Copyright 2017-2023 by XGBoost contributors
*/ */
#ifndef XGBOOST_USE_CUDA #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
// dummy implementation of HostDeviceVector in case CUDA is not used // dummy implementation of HostDeviceVector in case CUDA is not used
@@ -199,4 +199,4 @@ template class HostDeviceVector<std::size_t>;
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_USE_CUDA #endif // XGBOOST_USE_CUDA && !defined(XGBOOST_USE_HIP)

View File

@@ -139,6 +139,7 @@ class HostDeviceVectorImpl {
auto ptr = other->ConstDevicePointer(); auto ptr = other->ConstDevicePointer();
SetDevice(); SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx()); CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr, ptr,
other->Size() * sizeof(T), other->Size() * sizeof(T),
@@ -195,6 +196,7 @@ class HostDeviceVectorImpl {
gpu_access_ = access; gpu_access_ = access;
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
SetDevice(); SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(), dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_->data().get(), data_d_->data().get(),
data_d_->size() * sizeof(T), data_d_->size() * sizeof(T),
@@ -211,6 +213,7 @@ class HostDeviceVectorImpl {
// data is on the host // data is on the host
LazyResizeDevice(data_h_.size()); LazyResizeDevice(data_h_.size());
SetDevice(); SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
data_h_.data(), data_h_.data(),
data_d_->size() * sizeof(T), data_d_->size() * sizeof(T),
@@ -239,6 +242,7 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size()); LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
SetDevice(); SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(), dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), cudaMemcpyDefault)); data_d_->size() * sizeof(T), cudaMemcpyDefault));
} }
@@ -248,6 +252,7 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size()); LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
SetDevice(); SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin, dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault)); data_d_->size() * sizeof(T), cudaMemcpyDefault));
} }

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "host_device_vector.cu"
#endif

View File

@@ -12,7 +12,8 @@
namespace xgboost { namespace xgboost {
namespace linalg { namespace linalg {
template <typename T, int32_t D, typename Fn> template <typename T, int32_t D, typename Fn>
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) { void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
{
dh::safe_cuda(cudaSetDevice(t.DeviceIdx())); dh::safe_cuda(cudaSetDevice(t.DeviceIdx()));
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value, static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
"For function with return, use transform instead."); "For function with return, use transform instead.");
@@ -28,7 +29,8 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s
} }
template <typename T, int32_t D, typename Fn> template <typename T, int32_t D, typename Fn>
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) { void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
{
if (t.Contiguous()) { if (t.Contiguous()) {
auto ptr = t.Values().data(); auto ptr = t.Values().data();
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); }); dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });

View File

@@ -42,7 +42,7 @@ void ElementWiseKernelHost(linalg::TensorView<T, D> t, int32_t n_threads, Fn&& f
} }
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
template <typename T, int32_t D, typename Fn> template <typename T, int32_t D, typename Fn>
void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) { void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
common::AssertGPUSupport(); common::AssertGPUSupport();
@@ -60,7 +60,7 @@ void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn)
} }
ElementWiseKernelHost(t, ctx->Threads(), fn); ElementWiseKernelHost(t, ctx->Threads(), fn);
} }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
template <typename T, std::int32_t kDim> template <typename T, std::int32_t kDim>
auto cbegin(TensorView<T, kDim> const& v) { // NOLINT auto cbegin(TensorView<T, kDim> const& v) { // NOLINT

View File

@@ -143,7 +143,7 @@ CheckNAN(T) {
return false; return false;
} }
#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__) #if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
bool CheckNAN(double v); bool CheckNAN(double v);
@@ -152,23 +152,27 @@ bool CheckNAN(double v);
XGBOOST_DEVICE bool inline CheckNAN(float x) { XGBOOST_DEVICE bool inline CheckNAN(float x) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
return isnan(x); return isnan(x);
#elif defined(__HIPCC__)
return __builtin_isnan(x);
#else #else
return std::isnan(x); return std::isnan(x);
#endif // defined(__CUDA_ARCH__) #endif // defined(__CUDA_ARCH__) || defined(__HIPCC__)
} }
XGBOOST_DEVICE bool inline CheckNAN(double x) { XGBOOST_DEVICE bool inline CheckNAN(double x) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
return isnan(x); return isnan(x);
#elif defined(__HIPCC__)
return __builtin_isnan(x);
#else #else
return std::isnan(x); return std::isnan(x);
#endif // defined(__CUDA_ARCH__) #endif // defined(__CUDA_ARCH__) || defined(__HIPCC__)
} }
#endif // XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__) #endif // XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
// GPU version is not uploaded in CRAN anyway. // GPU version is not uploaded in CRAN anyway.
// Specialize only when using R with CPU. // Specialize only when using R with CPU.
#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) #if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
double LogGamma(double v); double LogGamma(double v);
#else // Not R or R with GPU. #else // Not R or R with GPU.
@@ -191,7 +195,7 @@ XGBOOST_DEVICE inline T LogGamma(T v) {
#endif // _MSC_VER #endif // _MSC_VER
} }
#endif // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) #endif // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace common } // namespace common
} // namespace xgboost } // namespace xgboost

View File

@@ -15,6 +15,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id); values.SetDevice(ctx->gpu_id);
auto const d_values = values.ConstDeviceSpan(); auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0, return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
thrust::plus<float>{}); thrust::plus<float>{});
} }

View File

@@ -99,12 +99,12 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
namespace cuda_impl { namespace cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values); double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline double Reduce(Context const*, HostDeviceVector<float> const&) { inline double Reduce(Context const*, HostDeviceVector<float> const&) {
AssertGPUSupport(); AssertGPUSupport();
return 0; return 0;
} }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace cuda_impl } // namespace cuda_impl
/** /**

4
src/common/numeric.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "numeric.cu"
#endif

View File

@@ -109,6 +109,7 @@ template <typename T, typename U>
void CopyTo(Span<T> out, Span<U> src) { void CopyTo(Span<T> out, Span<U> src) {
CHECK_EQ(out.size(), src.size()); CHECK_EQ(out.size(), src.size());
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value); static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(), dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
out.size_bytes(), out.size_bytes(),
cudaMemcpyDefault)); cudaMemcpyDefault));
@@ -162,6 +163,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// Compute output ptr // Compute output ptr
auto transform_it = auto transform_it =
thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data())); thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
thrust::transform( thrust::transform(
thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(), thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
out_ptr.data(), out_ptr.data(),
@@ -211,6 +213,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y, Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) { Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device));
CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -309,6 +312,7 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) { size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
Span<SketchEntry> out; Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts; dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty(); bool first_window = this->Current().empty();
@@ -378,6 +382,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
}); });
// Reverse scan to accumulate weights into first duplicated element on left. // Reverse scan to accumulate weights into first duplicated element on left.
auto val_it = thrust::make_reverse_iterator(dh::tend(entries)); auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
thrust::inclusive_scan_by_key( thrust::inclusive_scan_by_key(
thrust::cuda::par(alloc), key_it, key_it + entries.size(), thrust::cuda::par(alloc), key_it, key_it + entries.size(),
val_it, val_it, val_it, val_it,
@@ -443,6 +448,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr, void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) { Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
timer_.Start(__func__); timer_.Start(__func__);
if (this->Current().size() == 0) { if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -478,6 +484,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
void SketchContainer::FixError() { void SketchContainer::FixError() {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current()); auto in = dh::ToSpan(this->Current());
dh::LaunchN(in.size(), [=] __device__(size_t idx) { dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -635,10 +642,12 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1); CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
max_values.resize(d_in_columns_ptr.size() - 1); max_values.resize(d_in_columns_ptr.size() - 1);
dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1); dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
thrust::make_discard_iterator(), d_max_values.begin(), thrust::make_discard_iterator(), d_max_values.begin(),
thrust::equal_to<bst_feature_t>{}, thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; }); [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values)); dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
auto max_it = MakeIndexTransformIter([&](auto i) { auto max_it = MakeIndexTransformIter([&](auto i) {
if (IsCat(h_feature_types, i)) { if (IsCat(h_feature_types, i)) {

View File

@@ -175,6 +175,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>> template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) { size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
this->columns_ptr_.SetDevice(device_); this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan(); Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
@@ -186,11 +187,13 @@ class SketchContainer {
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
d_column_scan = this->columns_ptr_.DeviceSpan(); d_column_scan = this->columns_ptr_.DeviceSpan();
size_t n_uniques = dh::SegmentedUnique( size_t n_uniques = dh::SegmentedUnique(
thrust::cuda::par(alloc), d_column_scan.data(), thrust::cuda::par(alloc), d_column_scan.data(),
d_column_scan.data() + d_column_scan.size(), entries.data(), d_column_scan.data() + d_column_scan.size(), entries.data(),
entries.data() + entries.size(), scan_out.DevicePointer(), entries.data() + entries.size(), scan_out.DevicePointer(),
entries.data(), detail::SketchUnique{}, key_comp); entries.data(), detail::SketchUnique{}, key_comp);
this->columns_ptr_.Copy(scan_out); this->columns_ptr_.Copy(scan_out);
CHECK(!this->columns_ptr_.HostCanRead()); CHECK(!this->columns_ptr_.HostCanRead());

4
src/common/quantile.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "quantile.cu"
#endif

View File

@@ -62,7 +62,7 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
return rank; return rank;
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*, common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
common::Span<float const>) { common::Span<float const>) {
@@ -108,9 +108,9 @@ void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
}); });
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
DMLC_REGISTER_PARAMETER(LambdaRankParam); DMLC_REGISTER_PARAMETER(LambdaRankParam);
@@ -120,7 +120,7 @@ void PreCache::InitOnCPU(Context const*, MetaInfo const& info) {
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); }); [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA)
@@ -130,9 +130,9 @@ void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); }); [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) { std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
std::string out_name; std::string out_name;

View File

@@ -23,6 +23,10 @@
#include "xgboost/logging.h" // for CHECK #include "xgboost/logging.h" // for CHECK
#include "xgboost/span.h" // for Span #include "xgboost/span.h" // for Span
#if defined(XGBOOST_USE_HIP)
#include <hipcub/hipcub.hpp>
#endif
namespace xgboost::ltr { namespace xgboost::ltr {
namespace cuda_impl { namespace cuda_impl {
void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels, void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "ranking_utils.cu"
#endif

View File

@@ -18,6 +18,7 @@
namespace xgboost { namespace xgboost {
namespace common { namespace common {
namespace cuda_impl { namespace cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t, void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) { common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1); CHECK_GE(t.Shape(1), 1);

View File

@@ -216,6 +216,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
detail::SegOp<SegIt>{seg_beg, seg_end}); detail::SegOp<SegIt>{seg_beg, seg_end});
auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul), auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
detail::WeightOp<WIter>{w_begin, d_sorted_idx}); detail::WeightOp<WIter>{w_begin, d_sorted_idx});
thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights, thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
scan_val, weights_cdf.begin()); scan_val, weights_cdf.begin());

View File

@@ -112,7 +112,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWe
void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out); void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights, inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights,
linalg::Tensor<float, 1>*) { linalg::Tensor<float, 1>*) {
common::AssertGPUSupport(); common::AssertGPUSupport();
@@ -120,7 +120,7 @@ inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalW
inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) { inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
common::AssertGPUSupport(); common::AssertGPUSupport();
} }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace cuda_impl } // namespace cuda_impl
/** /**

4
src/common/stats.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "stats.cu"
#endif

View File

@@ -25,12 +25,12 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::common::ProbabilityDistributionType);
namespace xgboost { namespace xgboost {
namespace common { namespace common {
#ifndef __CUDACC__ #if !defined(__CUDACC__) && !defined(__HIPCC__)
using std::log; using std::log;
using std::fmax; using std::fmax;
#endif // __CUDACC__ #endif // __CUDACC__ && __HIPCC__
enum class CensoringType : uint8_t { enum class CensoringType : uint8_t {
kUncensored, kRightCensored, kLeftCensored, kIntervalCensored kUncensored, kRightCensored, kLeftCensored, kIntervalCensored

View File

@@ -59,9 +59,11 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
}); });
dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(), dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
out_group_threads_ptr.size()); out_group_threads_ptr.size());
std::size_t total = 0; size_t total = 0;
dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1, dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
sizeof(total), cudaMemcpyDeviceToHost)); sizeof(total), cudaMemcpyDeviceToHost));
return total; return total;
} }

View File

@@ -19,7 +19,9 @@
#if defined (__CUDACC__) #if defined (__CUDACC__)
#include "device_helpers.cuh" #include "device_helpers.cuh"
#endif // defined (__CUDACC__) #elif defined(__HIPCC__)
#include "device_helpers.hip.h"
#endif // defined (__CUDACC__) || defined(__HIPCC__)
namespace xgboost { namespace xgboost {
namespace common { namespace common {
@@ -28,7 +30,7 @@ constexpr size_t kBlockThreads = 256;
namespace detail { namespace detail {
#if defined(__CUDACC__) #if defined(__CUDACC__) || defined(__HIPCC__)
template <typename Functor, typename... SpanType> template <typename Functor, typename... SpanType>
__global__ void LaunchCUDAKernel(Functor _func, Range _range, __global__ void LaunchCUDAKernel(Functor _func, Range _range,
SpanType... _spans) { SpanType... _spans) {
@@ -36,7 +38,7 @@ __global__ void LaunchCUDAKernel(Functor _func, Range _range,
_func(i, _spans...); _func(i, _spans...);
} }
} }
#endif // defined(__CUDACC__) #endif // defined(__CUDACC__) || defined(__HIPCC__)
} // namespace detail } // namespace detail
@@ -127,7 +129,7 @@ class Transform {
UnpackShard(device, _vectors...); UnpackShard(device, _vectors...);
} }
#if defined(__CUDACC__) #if defined(__CUDACC__) || defined(__HIPCC__)
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr, template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
typename... HDV> typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const { void LaunchCUDA(Functor _func, HDV*... _vectors) const {
@@ -140,7 +142,9 @@ class Transform {
// granularity is used in data vector. // granularity is used in data vector.
size_t shard_size = range_size; size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)}; Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
const int kGrids = const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads)); static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) { if (kGrids == 0) {
@@ -159,7 +163,7 @@ class Transform {
LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA(); LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA();
} }
#endif // defined(__CUDACC__) #endif // defined(__CUDACC__) || defined(__HIPCC__)
template <typename... HDV> template <typename... HDV>
void LaunchCPU(Functor func, HDV *...vectors) const { void LaunchCPU(Functor func, HDV *...vectors) const {

View File

@@ -28,7 +28,7 @@ Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
namespace { namespace {
inline constexpr char const* kDevice = "device"; inline constexpr char const* kDevice = "device";
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CPU(); device = DeviceOrd::CPU();
return device; return device;
@@ -208,10 +208,10 @@ std::int32_t Context::Threads() const {
return n_threads; return n_threads;
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
CUDAContext const* Context::CUDACtx() const { CUDAContext const* Context::CUDACtx() const {
common::AssertGPUSupport(); common::AssertGPUSupport();
return nullptr; return nullptr;
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
} // namespace xgboost } // namespace xgboost

4
src/context.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "context.cu"
#endif

View File

@@ -20,7 +20,9 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
* case where 0 might be given should either use None, 1, or 2 instead for * case where 0 might be given should either use None, 1, or 2 instead for
* clarity. * clarity.
*/ */
#ifndef XGBOOST_USE_HIP
LOG(FATAL) << "Invalid stream ID in array interface: " << stream; LOG(FATAL) << "Invalid stream ID in array interface: " << stream;
#endif
case 1: case 1:
// default legacy stream // default legacy stream
break; break;
@@ -38,6 +40,8 @@ bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
if (!ptr) { if (!ptr) {
return false; return false;
} }
#if defined(XGBOOST_USE_CUDA)
cudaPointerAttributes attr; cudaPointerAttributes attr;
auto err = cudaPointerGetAttributes(&attr, ptr); auto err = cudaPointerGetAttributes(&attr, ptr);
// reset error // reset error
@@ -59,5 +63,34 @@ bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
// other errors, `cudaErrorNoDevice`, `cudaErrorInsufficientDriver` etc. // other errors, `cudaErrorNoDevice`, `cudaErrorInsufficientDriver` etc.
return false; return false;
} }
#elif defined(XGBOOST_USE_HIP)
hipPointerAttribute_t attr;
auto err = hipPointerGetAttributes(&attr, ptr);
// reset error
CHECK_EQ(err, hipGetLastError());
if (err == hipErrorInvalidValue) {
return false;
} else if (err == hipSuccess) {
#if HIP_VERSION_MAJOR < 6
switch (attr.memoryType) {
case hipMemoryTypeHost:
return false;
default:
return true;
}
#else
switch (attr.type) {
case hipMemoryTypeUnregistered:
case hipMemoryTypeHost:
return false;
default:
return true;
}
#endif
return true;
} else {
return false;
}
#endif
} }
} // namespace xgboost } // namespace xgboost

View File

@@ -28,6 +28,8 @@
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
#include "cuda_fp16.h" #include "cuda_fp16.h"
#elif defined(XGBOOST_USE_HIP)
#include <hip/hip_fp16.h>
#endif #endif
namespace xgboost { namespace xgboost {
@@ -308,12 +310,12 @@ class ArrayInterfaceHandler {
template <typename T, typename E = void> template <typename T, typename E = void>
struct ToDType; struct ToDType;
// float // float
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
template <> template <>
struct ToDType<__half> { struct ToDType<__half> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2; static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
}; };
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
template <> template <>
struct ToDType<float> { struct ToDType<float> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4; static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@@ -362,10 +364,10 @@ struct ToDType<int64_t> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8; static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
}; };
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); } inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; } inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
/** /**
* \brief A type erased view over __array_interface__ protocol defined by numpy * \brief A type erased view over __array_interface__ protocol defined by numpy
@@ -463,11 +465,11 @@ class ArrayInterface {
CHECK(sizeof(long double) == 16) << error::NoF128(); CHECK(sizeof(long double) == 16) << error::NoF128();
type = T::kF16; type = T::kF16;
} else if (typestr[1] == 'f' && typestr[2] == '2') { } else if (typestr[1] == 'f' && typestr[2] == '2') {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
type = T::kF2; type = T::kF2;
#else #else
LOG(FATAL) << "Half type is not supported."; LOG(FATAL) << "Half type is not supported.";
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
} else if (typestr[1] == 'f' && typestr[2] == '4') { } else if (typestr[1] == 'f' && typestr[2] == '4') {
type = T::kF4; type = T::kF4;
} else if (typestr[1] == 'f' && typestr[2] == '8') { } else if (typestr[1] == 'f' && typestr[2] == '8') {
@@ -502,15 +504,15 @@ class ArrayInterface {
using T = ArrayInterfaceHandler::Type; using T = ArrayInterfaceHandler::Type;
switch (type) { switch (type) {
case T::kF2: { case T::kF2: {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
return func(reinterpret_cast<__half const *>(data)); return func(reinterpret_cast<__half const *>(data));
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA) || || defined(XGBOOST_USE_HIP)
} }
case T::kF4: case T::kF4:
return func(reinterpret_cast<float const *>(data)); return func(reinterpret_cast<float const *>(data));
case T::kF8: case T::kF8:
return func(reinterpret_cast<double const *>(data)); return func(reinterpret_cast<double const *>(data));
#ifdef __CUDA_ARCH__ #if defined(__CUDA_ARCH__ ) || defined(__HIPCC__)
case T::kF16: { case T::kF16: {
// CUDA device code doesn't support long double. // CUDA device code doesn't support long double.
SPAN_CHECK(false); SPAN_CHECK(false);
@@ -557,7 +559,7 @@ class ArrayInterface {
static_assert(sizeof...(index) <= D, "Invalid index."); static_assert(sizeof...(index) <= D, "Invalid index.");
return this->DispatchCall([=](auto const *p_values) -> T { return this->DispatchCall([=](auto const *p_values) -> T {
std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...); std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
// No operator defined for half -> size_t // No operator defined for half -> size_t
using Type = std::conditional_t< using Type = std::conditional_t<
std::is_same<__half, std::is_same<__half,
@@ -567,7 +569,7 @@ class ArrayInterface {
return static_cast<T>(static_cast<Type>(p_values[offset])); return static_cast<T>(static_cast<Type>(p_values[offset]));
#else #else
return static_cast<T>(p_values[offset]); return static_cast<T>(p_values[offset]);
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}); });
} }
@@ -604,7 +606,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
}; };
switch (array.type) { switch (array.type) {
case ArrayInterfaceHandler::kF2: { case ArrayInterfaceHandler::kF2: {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
dispatch(__half{}); dispatch(__half{});
#endif #endif
break; break;

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "array_interface.cu"
#endif

View File

@@ -800,9 +800,9 @@ void MetaInfo::Validate(std::int32_t device) const {
} }
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void MetaInfo::SetInfoFromCUDA(Context const&, StringView, Json) { common::AssertGPUSupport(); } void MetaInfo::SetInfoFromCUDA(Context const&, StringView, Json) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
bool MetaInfo::IsVerticalFederated() const { bool MetaInfo::IsVerticalFederated() const {
return collective::IsFederated() && IsColumnSplit(); return collective::IsFederated() && IsColumnSplit();

View File

@@ -44,6 +44,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
std::copy(array.shape, array.shape + D, shape.data()); std::copy(array.shape, array.shape + D, shape.data());
// set data // set data
data->Resize(array.n); data->Resize(array.n);
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T), dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
cudaMemcpyDefault, ctx->Stream())); cudaMemcpyDefault, ctx->Stream()));
}); });
@@ -95,8 +96,10 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
} }
}); });
bool non_dec = true; bool non_dec = true;
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool), dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
CHECK(non_dec) << "`qid` must be sorted in increasing order along with data."; CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
size_t bytes = 0; size_t bytes = 0;
dh::caching_device_vector<uint32_t> out(array_interface.Shape(0)); dh::caching_device_vector<uint32_t> out(array_interface.Shape(0));
@@ -114,8 +117,10 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
group_ptr_.clear(); group_ptr_.clear();
group_ptr_.resize(h_num_runs_out + 1, 0); group_ptr_.resize(h_num_runs_out + 1, 0);
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
thrust::inclusive_scan(thrust::cuda::par(alloc), cnt.begin(), thrust::inclusive_scan(thrust::cuda::par(alloc), cnt.begin(),
cnt.begin() + h_num_runs_out, cnt.begin()); cnt.begin() + h_num_runs_out, cnt.begin());
thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out, thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out,
group_ptr_.begin() + 1); group_ptr_.begin() + 1);
} }

4
src/data/data.hip Normal file
View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "data.cu"
#endif

View File

@@ -122,7 +122,9 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
device_idx_ = dh::CudaGetPointerDevice(first_column.data); device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId); CHECK_NE(device_idx_, Context::kCpuId);
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_idx_));
for (auto& json_col : json_columns) { for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col)); auto column = ArrayInterface<1>(get<Object const>(json_col));
columns.push_back(column); columns.push_back(column);
@@ -211,6 +213,7 @@ template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx, std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) { float missing) {
dh::safe_cuda(cudaSetDevice(device_idx)); dh::safe_cuda(cudaSetDevice(device_idx));
IsValidFunctor is_valid(missing); IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
@@ -244,6 +247,7 @@ std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offs
&offset[ridx]), &offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT static_cast<unsigned long long>(cnt)); // NOLINT
}); });
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
bst_row_t row_stride = bst_row_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),

View File

@@ -1,7 +1,7 @@
/** /**
* Copyright 2019-2023, XGBoost contributors * Copyright 2019-2023, XGBoost contributors
*/ */
#ifndef XGBOOST_USE_CUDA #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
#include "ellpack_page.h" #include "ellpack_page.h"
@@ -52,5 +52,4 @@ size_t EllpackPage::Size() const {
return impl_->Cuts(); return impl_->Cuts();
} }
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
#endif // XGBOOST_USE_CUDA

View File

@@ -18,6 +18,10 @@
#include "gradient_index.h" #include "gradient_index.h"
#include "xgboost/data.h" #include "xgboost/data.h"
#if defined(XGBOOST_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif
namespace xgboost { namespace xgboost {
EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {} EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {}
@@ -102,6 +106,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
row_stride(row_stride), row_stride(row_stride),
n_rows(n_rows) { n_rows(n_rows) {
monitor_.Init("ellpack_page"); monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device));
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
@@ -256,6 +261,8 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// Go one level down into cub::DeviceScan API to set OffsetT as 64 bit // Go one level down into cub::DeviceScan API to set OffsetT as 64 bit
// So we don't crash on n > 2^31 // So we don't crash on n > 2^31
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
#if defined(__CUDACC__)
using DispatchScan = using DispatchScan =
cub::DispatchScan<decltype(key_value_index_iter), decltype(out), cub::DispatchScan<decltype(key_value_index_iter), decltype(out),
TupleScanOp<Tuple>, cub::NullType, int64_t>; TupleScanOp<Tuple>, cub::NullType, int64_t>;
@@ -278,6 +285,17 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
key_value_index_iter, out, TupleScanOp<Tuple>(), key_value_index_iter, out, TupleScanOp<Tuple>(),
cub::NullType(), batch.Size(), nullptr, false); cub::NullType(), batch.Size(), nullptr, false);
#endif #endif
#elif defined (__HIPCC__)
rocprim::inclusive_scan(nullptr, temp_storage_bytes, key_value_index_iter, out, batch.Size(), TupleScanOp<Tuple>());
dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
rocprim::inclusive_scan(temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, batch.Size(),
TupleScanOp<Tuple>());
#endif
} }
void WriteNullValues(EllpackPageImpl* dst, int device_idx, void WriteNullValues(EllpackPageImpl* dst, int device_idx,
@@ -534,11 +552,13 @@ void EllpackPageImpl::CreateHistIndices(int device,
// copy data entries to device. // copy data entries to device.
if (row_batch.data.DeviceCanRead()) { if (row_batch.data.DeviceCanRead()) {
auto const& d_data = row_batch.data.ConstDeviceSpan(); auto const& d_data = row_batch.data.ConstDeviceSpan();
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin, entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault)); n_entries * sizeof(Entry), cudaMemcpyDefault));
} else { } else {
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector(); const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin, entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault)); n_entries * sizeof(Entry), cudaMemcpyDefault));

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "ellpack_page.cu"
#endif

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "ellpack_page_raw_format.cu"
#endif

View File

@@ -45,7 +45,7 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
void Fetch() final; void Fetch() final;
}; };
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void EllpackPageSource::Fetch() { inline void EllpackPageSource::Fetch() {
// silent the warning about unused variables. // silent the warning about unused variables.
(void)(row_stride_); (void)(row_stride_);

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "ellpack_page_source.cu"
#endif

View File

@@ -65,12 +65,12 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
max_numeric_bins_per_feat(max_bin_per_feat), max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {} isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
GHistIndexMatrix::GHistIndexMatrix(Context const *, MetaInfo const &, EllpackPage const &, GHistIndexMatrix::GHistIndexMatrix(Context const *, MetaInfo const &, EllpackPage const &,
BatchParam const &) { BatchParam const &) {
common::AssertGPUSupport(); common::AssertGPUSupport();
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
GHistIndexMatrix::~GHistIndexMatrix() = default; GHistIndexMatrix::~GHistIndexMatrix() = default;

View File

@@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "gradient_index.cu"
#endif

View File

@@ -357,7 +357,7 @@ BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(Context const* ctx,
return BatchSet<ExtSparsePage>(begin_iter); return BatchSet<ExtSparsePage>(begin_iter);
} }
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, DataIterHandle, float, inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, DataIterHandle, float,
std::shared_ptr<DMatrix>) { std::shared_ptr<DMatrix>) {
// silent the warning about unused variables. // silent the warning about unused variables.
@@ -377,5 +377,5 @@ inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const*,
inline void GetCutsFromEllpack(EllpackPage const&, common::HistogramCuts*) { inline void GetCutsFromEllpack(EllpackPage const&, common::HistogramCuts*) {
common::AssertGPUSupport(); common::AssertGPUSupport();
} }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace xgboost::data } // namespace xgboost::data

Some files were not shown because too many files have changed in this diff Show More