revert

remove async
switch to default stream
2024-10-22 01:21:22 +02:00 · 2024-10-22 01:18:12 +02:00 · 2024-10-22 01:14:29 +02:00 · 2024-10-22 00:58:01 +02:00 · 2024-10-22 00:53:56 +02:00 · 2024-10-22 00:51:34 +02:00
230 changed files with 2461 additions and 314 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -5,3 +5,6 @@
 [submodule "gputreeshap"]
 	path = gputreeshap
 	url = https://github.com/rapidsai/gputreeshap.git
+[submodule "rocgputreeshap"]
+	path = rocgputreeshap
+	url = https://github.com/ROCmSoftwarePlatform/rocgputreeshap
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -66,7 +66,7 @@ option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC
 option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
 option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
-option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF)
+option(USE_DEVICE_DEBUG "Generate CUDA/HIP device debug info." OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
 option(RABIT_MOCK "Build rabit with mock" OFF)
@ -99,6 +99,12 @@ cmake_dependent_option(USE_CUDA_LTO
  "${CMAKE_INTERPROCEDURAL_OPTIMIZATION}"
  "CMAKE_VERSION VERSION_GREATER_EQUAL 3.25;USE_CUDA;CMAKE_INTERPROCEDURAL_OPTIMIZATION"
  OFF)
+## HIP
+option(USE_HIP  "Build with GPU acceleration" OFF)
+option(USE_RCCL "Build with RCCL to enable distributed GPU support." OFF)
+# This is specifically designed for PyPI binary release and should be disabled for most of the cases.
+option(USE_DLOPEN_RCCL "Whether to load nccl dynamically." OFF)
+option(BUILD_WITH_SHARED_RCCL "Build with shared RCCL library." OFF)
 ## Sanitizers
 option(USE_SANITIZER "Use santizer flags" OFF)
 option(SANITIZER_PATH "Path to sanitizes.")
@ -131,6 +137,18 @@ endif()
 if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
  message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.")
 endif()
+if(USE_RCCL AND NOT (USE_HIP))
+    message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
+endif()
+if(BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
+  message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
+endif()
+if(USE_DLOPEN_RCCL AND (NOT USE_RCCL))
+  message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable USE_DLOPEN_RCCL.")
+endif()
+if(USE_DLOPEN_RCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
+  message(SEND_ERROR "`USE_DLOPEN_RCCL` supports only Linux at the moment.")
+endif()
 if(JVM_BINDINGS AND R_LIB)
  message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
 endif()
@ -227,6 +245,25 @@ if(USE_CUDA)
  find_package(CUDAToolkit REQUIRED)
 endif()

+if (USE_HIP)
+  set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE)
+  # `export CXX=' is ignored by CMake HIP.
+  set(CMAKE_HIP_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+  message(STATUS "Configured HIP host compiler: ${CMAKE_HIP_HOST_COMPILER}")
+
+  enable_language(HIP)
+  find_package(hip REQUIRED)
+  find_package(rocthrust REQUIRED)
+  find_package(hipcub REQUIRED)
+
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS}")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
+  #set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
+  add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
+endif (USE_HIP)
+
 if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
    ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
      (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
@ -266,6 +303,10 @@ if(USE_NCCL)
  find_package(Nccl REQUIRED)
 endif()

+if (USE_RCCL)
+  find_package(rccl REQUIRED)
+endif (USE_RCCL)
+
 # dmlc-core
 msvc_use_static_runtime()
 if(FORCE_SHARED_CRT)
--- a/R-package/src/xgboost_custom.cc
+++ b/R-package/src/xgboost_custom.cc
@ -36,7 +36,7 @@ namespace common {
 bool CheckNAN(double v) {
  return ISNAN(v);
 }
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 double LogGamma(double v) {
  return lgammafn(v);
 }
--- a/README-ROCm.md
+++ b/README-ROCm.md
@ -0,0 +1,64 @@
+# ROCm version
+
+ROCm 5.5 and newer
+
+# Code
+Clone the code from our repo
+
+1. `git clone https://github.com/ROCmSoftwarePlatform/xgboost`
+1. `cd xgboost`
+1. `git checkout master-rocm`
+
+or a tag/branch with rocm suffix, such as v2.0.1-rocm
+
+# Submodules
+XGBoost ROCm support requires a few modules, which can be initialized as,
+
+`git submodule update --init --recursive`
+
+# Configure
+The following export may be required for some systems, and the ROCm path depends on installation,
+
+1. `export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:/opt/rocm/lib/cmake:/opt/rocm/lib/cmake/AMDDeviceLibs/`
+1. `mkdir build`
+1. `cd build`
+1. `cmake -DUSE_HIP=ON ../`
+1. or `cmake -DUSE_HIP=1 ../`
+1. or `cmake -DUSE_HIP=1 -DUSE_RCCL=1 ../`
+1. or `cmake -DUSE_HIP=1 -DGOOGLE_TEST=1 ../`
+
+The first command may be optional depending on system configure.
+
+The **USE_HIP** macro enables HIP/ROCm support. **USE_RCCL** enables RCCL. **GOOGLE_TEST** enables Google test.
+
+apt-get install libgtest-dev libgmock-dev
+
+# Compile
+To compile, run command,
+
+`make -j`
+
+# Python Support
+After compilation, XGBoost can be installed as a Python package and supports a wide range of applications,
+
+1. `cd python-package/`
+1. `pip3 install .`
+
+# Use AMD GPUs
+When calling XGBoost, set the parameter `device` to `gpu` or `cuda`. Python sample,
+
+```
+params = dict()
+params["device"] = "gpu"
+params["tree_method"] = "hist"
+...
+```
+
+or
+
+```
+params = dict()
+params["device"] = "cuda"
+params["tree_method"] = "hist"
+...
+```
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@ -1,6 +1,5 @@
 # Automatically set source group based on folder
 function(auto_source_group SOURCES)
-
  foreach(FILE ${SOURCES})
      get_filename_component(PARENT_DIR "${FILE}" PATH)

@ -171,6 +170,23 @@ function(xgboost_set_cuda_flags target)
  endif()
 endfunction()

+# Set HIP related flags to target.
+function(xgboost_set_hip_flags target)
+  if (USE_DEVICE_DEBUG)
+    target_compile_options(${target} PRIVATE
+      $<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:HIP>>:-G>)
+  endif (USE_DEVICE_DEBUG)
+
+  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1)
+  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
+  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)
+
+  set_target_properties(${target} PROPERTIES
+    HIP_STANDARD 17
+    HIP_STANDARD_REQUIRED ON
+    HIP_SEPARABLE_COMPILATION OFF)
+endfunction(xgboost_set_hip_flags)
+
 function(xgboost_link_nccl target)
  set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1)
  if(USE_DLOPEN_NCCL)
@ -190,6 +206,27 @@ function(xgboost_link_nccl target)
  endif()
 endfunction()

+function(xgboost_link_rccl target)
+  set(xgboost_rccl_flags -DXGBOOST_USE_RCCL=1)
+  if(USE_DLOPEN_RCCL)
+    list(APPEND xgboost_rccl_flags -DXGBOOST_USE_DLOPEN_RCCL=1)
+  endif()
+
+  if(BUILD_STATIC_LIB)
+    target_include_directories(${target} PUBLIC ${RCCL_INCLUDE_DIR}/rccl)
+    target_compile_definitions(${target} PUBLIC ${xgboost_rccl_flags})
+    target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
+    target_link_libraries(${target} PUBLIC ${RCCL_LIBRARY})
+  else()
+    target_include_directories(${target} PRIVATE ${RCCL_INCLUDE_DIR}/rccl)
+    target_compile_definitions(${target} PRIVATE ${xgboost_rccl_flags})
+    target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
+    if(NOT USE_DLOPEN_RCCL)
+      target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
+    endif()
+  endif()
+endfunction()
+
 # compile options
 macro(xgboost_target_properties target)
  set_target_properties(${target} PROPERTIES
@ -280,6 +317,10 @@ macro(xgboost_target_link_libraries target)
    target_link_libraries(${target} PUBLIC CUDA::cudart_static)
  endif()

+  if (USE_HIP)
+    xgboost_set_hip_flags(${target})
+  endif (USE_HIP)
+
  if(PLUGIN_RMM)
    target_link_libraries(${target} PRIVATE rmm::rmm)
  endif()
@ -288,6 +329,10 @@ macro(xgboost_target_link_libraries target)
    xgboost_link_nccl(${target})
  endif()

+  if(USE_RCCL)
+    xgboost_link_rccl(${target})
+  endif()
+
  if(USE_NVTX)
    target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
  endif()
--- a/cmake/xgboost-config.cmake.in
+++ b/cmake/xgboost-config.cmake.in
@ -3,6 +3,8 @@
 set(USE_OPENMP @USE_OPENMP@)
 set(USE_CUDA @USE_CUDA@)
 set(USE_NCCL @USE_NCCL@)
+set(USE_HIP @USE_HIP@)
+set(USE_RCCL @USE_RCCL@)
 set(XGBOOST_BUILD_STATIC_LIB @BUILD_STATIC_LIB@)

 include(CMakeFindDependencyMacro)
@ -15,6 +17,9 @@ if (XGBOOST_BUILD_STATIC_LIB)
  if(USE_CUDA)
    find_dependency(CUDA)
  endif()
+  if(USE_HIP)
+      find_dependency(HIP)
+  endif()
  # nccl should be linked statically if xgboost is built as static library.
 endif (XGBOOST_BUILD_STATIC_LIB)

--- a/demo/CLI/regression/runexp.sh
+++ b/demo/CLI/regression/runexp.sh
@ -4,13 +4,13 @@ python mapfeat.py
 # split train and test
 python mknfold.py machine.txt 1
 # training and output the models
-../../xgboost machine.conf
+../../../xgboost machine.conf
 # output predictions of test data
-../../xgboost machine.conf task=pred model_in=0002.model
+../../../xgboost machine.conf task=pred model_in=0002.model
 # print the boosters of 0002.model in dump.raw.txt
-../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
+../../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
 # print the boosters of 0002.model in dump.nice.txt with feature map
-../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt 
+../../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt 

 # cat the result
 cat dump.nice.txt
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@ -56,19 +56,19 @@
 /*!
 * \brief Tag function as usable by device
 */
-#if defined (__CUDA__) || defined(__NVCC__)
+#if defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
 #define XGBOOST_DEVICE __host__ __device__
 #else
 #define XGBOOST_DEVICE
-#endif  // defined (__CUDA__) || defined(__NVCC__)
+#endif  // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)

-#if defined(__CUDA__) || defined(__CUDACC__)
+#if defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
 #define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
 #define XGBOOST_DEV_INLINE __device__ __forceinline__
 #else
 #define XGBOOST_HOST_DEV_INLINE
 #define XGBOOST_DEV_INLINE
-#endif  // defined(__CUDA__) || defined(__CUDACC__)
+#endif  // defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)

 // These check are for Makefile.
 #if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@ -58,11 +58,11 @@

 namespace xgboost {

-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 // Sets a function to call instead of cudaSetDevice();
 // only added for testing
 void SetCudaSetDeviceHandler(void (*handler)(int));
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__

 template <typename T> struct HostDeviceVectorImpl;

--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@ -30,11 +30,11 @@

 // decouple it from xgboost.
 #ifndef LINALG_HD
-#if defined(__CUDA__) || defined(__NVCC__)
+#if defined(__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
 #define LINALG_HD __host__ __device__
 #else
 #define LINALG_HD
-#endif  // defined (__CUDA__) || defined(__NVCC__)
+#endif  // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
 #endif  // LINALG_HD

 namespace xgboost::linalg {
@ -118,7 +118,7 @@ using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value,

 template <int32_t n, typename Fn>
 LINALG_HD constexpr auto UnrollLoop(Fn fn) {
-#if defined __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 #pragma unroll n
 #endif  // defined __CUDA_ARCH__
  for (int32_t i = 0; i < n; ++i) {
@ -136,7 +136,7 @@ int32_t NativePopc(T v) {
 inline LINALG_HD int Popc(uint32_t v) {
 #if defined(__CUDA_ARCH__)
  return __popc(v);
-#elif defined(__GNUC__) || defined(__clang__)
+#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
  return __builtin_popcount(v);
 #elif defined(_MSC_VER)
  return __popcnt(v);
@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) {
 inline LINALG_HD int Popc(uint64_t v) {
 #if defined(__CUDA_ARCH__)
  return __popcll(v);
-#elif defined(__GNUC__) || defined(__clang__)
+#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
  return __builtin_popcountll(v);
 #elif defined(_MSC_VER) && defined(_M_X64)
  return __popcnt64(v);
--- a/include/xgboost/span.h
+++ b/include/xgboost/span.h
@ -40,7 +40,9 @@

 #if defined(__CUDACC__)
 #include <cuda_runtime.h>
-#endif  // defined(__CUDACC__)
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#endif

 /*!
 * The version number 1910 is picked up from GSL.
@ -102,7 +104,42 @@ namespace xgboost::common {

 #define SPAN_CHECK KERNEL_CHECK

-#else  // ------------------------------ not CUDA ----------------------------
+#elif defined(__HIPCC__)
+// Usual logging facility is not available inside device code.
+
+#if defined(_MSC_VER)
+
+// Windows HIP doesn't have __assert_fail.
+#define HIP_KERNEL_CHECK(cond)           \
+  do {                                    \
+    if (XGBOOST_EXPECT(!(cond), false)) { \
+      __builtin_trap();                           \
+    }                                     \
+  } while (0)
+
+#else  // defined(_MSC_VER)
+
+#define __ASSERT_STR_HELPER(x) #x
+
+#if 0
+#define HIP_KERNEL_CHECK(cond)  \
+  (XGBOOST_EXPECT((cond), true) \
+       ? static_cast<void>(0)   \
+       : __assert_fail(__ASSERT_STR_HELPER((cond)), __FILE__, __LINE__, __PRETTY_FUNCTION__))
+#else
+#define HIP_KERNEL_CHECK(cond)  \
+  (XGBOOST_EXPECT((cond), true) \
+       ? static_cast<void>(0)   \
+       : __builtin_trap())
+#endif
+
+#endif  // defined(_MSC_VER)
+
+#define KERNEL_CHECK HIP_KERNEL_CHECK
+
+#define SPAN_CHECK KERNEL_CHECK
+
+#else  // ------------------------------ not CUDA or HIP ----------------------------

 #if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1

@ -118,7 +155,7 @@ namespace xgboost::common {

 #endif  // defined(XGBOOST_STRICT_R_MODE)

-#endif  // __CUDA_ARCH__
+#endif  // __CUDA_ARCH__ || __HIPCC__

 #define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))

@ -315,7 +352,7 @@ struct IsSpanOracle<Span<T, Extent>> : std::true_type {};
 template <class T>
 struct IsSpan : public IsSpanOracle<typename std::remove_cv<T>::type> {};

-// Re-implement std algorithms here to adopt CUDA.
+// Re-implement std algorithms here to adopt CUDA/HIP
 template <typename T>
 struct Less {
  XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const {
--- a/jvm-packages/CMakeLists.txt
+++ b/jvm-packages/CMakeLists.txt
@ -9,6 +9,11 @@ if(USE_CUDA)
    ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
 endif()

+if(USE_HIP)
+  list(APPEND JVM_SOURCES
+    ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip)
+endif()
+
 add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})

 if(ENABLE_ALL_WARNINGS)
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@ -21,6 +21,8 @@ CONFIG = {
    "USE_S3": "OFF",
    "USE_CUDA": "OFF",
    "USE_NCCL": "OFF",
+    "USE_HIP": "OFF",
+    "USE_RCCL": "OFF",
    "JVM_BINDINGS": "ON",
    "LOG_CAPI_INVOCATION": "OFF",
 }
@ -79,7 +81,7 @@ def native_build(args):

    print("building Java wrapper")
    with cd(".."):
-        build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
+        build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip == 'ON' else 'build'
        maybe_makedirs(build_dir)

        if sys.platform == "linux":
@ -94,6 +96,10 @@ def native_build(args):
            CONFIG["USE_CUDA"] = "ON"
            CONFIG["USE_NCCL"] = "ON"
            CONFIG["USE_DLOPEN_NCCL"] = "OFF"
+        elif cli_args.use_hip== 'ON':
+            CONFIG['USE_HIP'] = 'ON'
+            CONFIG['USE_RCCL'] = 'ON'
+            CONFIG["USE_DLOPEN_RCCL"] = "OFF"

        args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]

@ -136,9 +142,9 @@ def native_build(args):
            run(f'"{sys.executable}" mapfeat.py')
            run(f'"{sys.executable}" mknfold.py machine.txt 1')

-    xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
+    xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip== "ON" else "xgboost4j"
    xgboost4j_spark = (
-        "xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
+        "xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "xgboost4j-spark"
    )

    print("copying native library")
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@ -48,6 +48,7 @@
        <cudf.classifier>cuda12</cudf.classifier>
        <scalatest.version>3.2.18</scalatest.version>
        <scala-collection-compat.version>2.12.0</scala-collection-compat.version>
+        <use.hip>OFF</use.hip>

        <!-- SPARK-36796 for JDK-17 test-->
        <extraJavaTestArgs>
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@ -104,6 +104,8 @@
                                <argument>${log.capi.invocation}</argument>
                                <argument>--use-cuda</argument>
                                <argument>${use.cuda}</argument>
+                                <argument>--use-hip</argument>
+                                <argument>${use.hip}</argument>
                            </arguments>
                            <workingDirectory>${user.dir}</workingDirectory>
                        </configuration>
--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp
@ -2,7 +2,7 @@
 // Created by bobwang on 2021/9/8.
 //

-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 #include <jni.h>

--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu
@ -1,6 +1,10 @@
 #include <jni.h>

+#if defined(XGBOOST_USE_CUDA)
 #include "../../../../src/common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../../../src/common/device_helpers.hip.h"
+#endif
 #include "../../../../src/common/cuda_pinned_allocator.h"
 #include "../../../../src/data/array_interface.h"
 #include "jvm_utils.h"
--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "xgboost4j-gpu.cu"
+#endif
--- a/plugin/federated/CMakeLists.txt
+++ b/plugin/federated/CMakeLists.txt
@ -60,6 +60,10 @@ target_sources(
 if(USE_CUDA)
  target_sources(objxgboost PRIVATE federated_comm.cu federated_coll.cu)
 endif()
+if(USE_HIP)
+  target_sources(objxgboost PRIVATE federated_comm.hip federated_coll.hip)
+endif()
+

 target_link_libraries(objxgboost PRIVATE federated_client "-Wl,--exclude-libs,ALL")
 target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_FEDERATED=1)
--- a/plugin/federated/federated_coll.cc
+++ b/plugin/federated/federated_coll.cc
@ -54,7 +54,7 @@ namespace {
 }
 }  // namespace

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 Coll *FederatedColl::MakeCUDAVar() {
  common::AssertGPUSupport();
  return nullptr;
--- a/plugin/federated/federated_coll.hip
+++ b/plugin/federated/federated_coll.hip
@ -0,0 +1,4 @@
+
+#ifdef XGBOOST_USE_HIP
+#include "federated_coll.cu"
+#endif
--- a/plugin/federated/federated_comm.cc
+++ b/plugin/federated/federated_comm.cc
@ -128,7 +128,7 @@ FederatedComm::FederatedComm(std::int32_t retry, std::chrono::seconds timeout, s
             client_cert);
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 Comm* FederatedComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
  common::AssertGPUSupport();
  return nullptr;
--- a/plugin/federated/federated_comm.hip
+++ b/plugin/federated/federated_comm.hip
@ -0,0 +1,4 @@
+
+#ifdef XGBOOST_USE_HIP
+#include "federated_comm.cu"
+#endif
--- a/python-package/packager/build_config.py
+++ b/python-package/packager/build_config.py
@ -22,6 +22,10 @@ class BuildConfiguration:  # pylint: disable=R0902
    plugin_federated: bool = False
    # Whether to enable rmm support
    plugin_rmm: bool = False
+    # Whether to enable HIP 
+    use_hip: bool = False
+    # Whether to enable RCCL
+    use_rccl: bool = False
    # Special option: See explanation below
    use_system_libxgboost: bool = False

--- a/python-package/xgboost/collective.py
+++ b/python-package/xgboost/collective.py
@ -257,7 +257,7 @@ class CommunicatorContext:
            return

        binfo = build_info()
-        if not binfo["USE_DLOPEN_NCCL"]:
+        if not binfo["USE_DLOPEN_NCCL"] and not binfo["USE_DLOPEN_RCCL"]:
            return

        try:
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 187e4be94513c71bea1e10a3eded6b9b2da0521f
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -16,6 +16,12 @@ if(USE_CUDA)
  target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
 endif()

+if (USE_HIP)
+  file(GLOB_RECURSE HIP_SOURCES *.cu *.hip.h)
+  target_sources(objxgboost PRIVATE ${HIP_SOURCES})
+  set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP)
+endif (USE_HIP)
+
 if(PLUGIN_SYCL)
  target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_SYCL=1)
 endif()
@ -37,6 +43,7 @@ msvc_use_static_runtime()

 # This grouping organises source files nicely in visual studio
 auto_source_group("${CUDA_SOURCES}")
+auto_source_group("${HIP_SOURCES}")
 auto_source_group("${CPU_SOURCES}")

 #-- End object library
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@ -67,14 +67,17 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
 static_assert(DMLC_CXX11_THREAD_LOCAL, "XGBoost depends on thread-local storage.");
 using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 namespace xgboost {
 void XGBBuildInfoDevice(Json *p_info) {
  auto &info = *p_info;
  info["USE_CUDA"] = Boolean{false};
  info["USE_NCCL"] = Boolean{false};
+  info["USE_HIP"] = Boolean{false};
+  info["USE_RCCL"] = Boolean{false};
  info["USE_RMM"] = Boolean{false};
  info["USE_DLOPEN_NCCL"] = Boolean{false};
+  info["USE_DLOPEN_RCCL"] = Boolean{false};
 }
 }  // namespace xgboost
 #endif
@ -277,7 +280,7 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
  API_END();
 }

-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *, char const *, DMatrixHandle *) {
  API_BEGIN();
  common::AssertGPUSupport();
@ -1039,7 +1042,7 @@ namespace xgboost {
 // copy user-supplied CUDA gradient arrays
 void CopyGradientFromCUDAArrays(Context const *, ArrayInterface<2, false> const &,
                                ArrayInterface<2, false> const &, linalg::Matrix<GradientPair> *)
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 {
  common::AssertGPUSupport();
 }
@ -1300,7 +1303,7 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
  API_END();
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 XGB_DLL int XGBoosterPredictFromCUDAArray(BoosterHandle handle, char const *, char const *,
                                          DMatrixHandle, xgboost::bst_ulong const **,
                                          xgboost::bst_ulong *, const float **) {
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@ -17,13 +17,20 @@
 #include "xgboost/learner.h"
 #if defined(XGBOOST_USE_NCCL)
 #include <nccl.h>
+#elif defined(XGBOOST_USE_RCCL)
+#include <rccl.h>
 #endif

 namespace xgboost {
 void XGBBuildInfoDevice(Json *p_info) {
  auto &info = *p_info;

+#if defined(XGBOOST_USE_CUDA)
  info["USE_CUDA"] = true;
+#elif defined(XGBOOST_USE_HIP)
+  info["USE_CUDA"] = true;
+  info["USE_HIP"] = true;
+#endif

  std::vector<Json> v{Json{Integer{THRUST_MAJOR_VERSION}}, Json{Integer{THRUST_MINOR_VERSION}},
                      Json{Integer{THRUST_SUBMINOR_VERSION}}};
@ -36,16 +43,29 @@ void XGBBuildInfoDevice(Json *p_info) {
  info["USE_NCCL"] = Boolean{true};
  v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
  info["NCCL_VERSION"] = v;
-
 #if defined(XGBOOST_USE_DLOPEN_NCCL)
  info["USE_DLOPEN_NCCL"] = Boolean{true};
 #else
  info["USE_DLOPEN_NCCL"] = Boolean{false};
 #endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
-
+#elif defined(XGBOOST_USE_RCCL)
+  info["USE_NCCL"] = Boolean{true};
+  info["USE_RCCL"] = Boolean{true};
+  v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
+  info["RCCL_VERSION"] = v;
+  info["NCCL_VERSION"] = v;
+#if defined(XGBOOST_USE_DLOPEN_RCCL)
+  info["USE_DLOPEN_NCCL"] = Boolean{true};
+  info["USE_DLOPEN_RCCL"] = Boolean{true};
+#else
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
+  info["USE_DLOPEN_RCCL"] = Boolean{false};
+#endif  // defined(XGBOOST_USE_DLOPEN_RCCL)
 #else
  info["USE_NCCL"] = Boolean{false};
  info["USE_DLOPEN_NCCL"] = Boolean{false};
+  info["USE_RCCL"] = Boolean{false};
+  info["USE_DLOPEN_RCCL"] = Boolean{false};
 #endif

 #if defined(XGBOOST_USE_RMM)
--- a/src/c_api/c_api_utils.h
+++ b/src/c_api/c_api_utils.h
@ -179,7 +179,7 @@ inline float GetMissing(Json const &config) {

 // Safe guard some global variables from being changed by XGBoost.
 class XGBoostAPIGuard {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  int32_t device_id_ {0};

  void SetGPUAttribute();
--- a/src/collective/coll.cc
+++ b/src/collective/coll.cc
@ -19,6 +19,8 @@

 #if defined(XGBOOST_USE_CUDA)
 #include "cuda_fp16.h"  // for __half
+#elif defined(XGBOOST_USE_HIP)
+#include <hip/hip_fp16.h>  // for __half
 #endif

 namespace xgboost::collective {
@ -26,6 +28,8 @@ template <typename T>
 bool constexpr IsFloatingPointV() {
 #if defined(XGBOOST_USE_CUDA)
  return std::is_floating_point_v<T> || std::is_same_v<T, __half>;
+#elif defined(XGBOOST_USE_HIP) /* hack for HIP/Clang */
+  return std::is_floating_point_v<T> || (sizeof(T) == sizeof(unsigned short));
 #else
  return std::is_floating_point_v<T>;
 #endif  // defined(XGBOOST_USE_CUDA)
@ -136,7 +140,7 @@ bool constexpr IsFloatingPointV() {
  }
 }

-#if !defined(XGBOOST_USE_NCCL)
+#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
 Coll* Coll::MakeCUDAVar() {
  LOG(FATAL) << "NCCL is required for device communication.";
  return nullptr;
--- a/src/collective/coll.cu
+++ b/src/collective/coll.cu
@ -1,7 +1,7 @@
 /**
 * Copyright 2023-2024, XGBoost Contributors
 */
-#if defined(XGBOOST_USE_NCCL)
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include <cstdint>  // for int8_t, int64_t

 #include "../common/device_helpers.cuh"
@ -9,7 +9,11 @@
 #include "allgather.h"  // for AllgatherVOffset
 #include "coll.cuh"
 #include "comm.cuh"
+#if defined(XGBOOST_USE_NCCL)
 #include "nccl.h"
+#elif defined(XGBOOST_USE_RCCL)
+#include "rccl.h"
+#endif
 #include "xgboost/collective/result.h"  // for Result
 #include "xgboost/span.h"               // for Span

--- a/src/collective/comm.cc
+++ b/src/collective/comm.cc
@ -214,7 +214,7 @@ RabitComm::RabitComm(std::string const& tracker_host, std::int32_t tracker_port,
  }
 }

-#if !defined(XGBOOST_USE_NCCL)
+#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
 Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
  common::AssertGPUSupport();
  common::AssertNCCLSupport();
--- a/src/collective/comm.cu
+++ b/src/collective/comm.cu
@ -1,7 +1,7 @@
 /**
 * Copyright 2023, XGBoost Contributors
 */
-#if defined(XGBOOST_USE_NCCL)
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include <algorithm>  // for sort
 #include <cstddef>    // for size_t
 #include <cstdint>    // for uint64_t, int8_t
@ -39,12 +39,22 @@ Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared
 }

 inline constexpr std::size_t kUuidLength =
-    sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t);
+#if defined(XGBOOST_USE_CUDA)
+  sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t);
+#elif defined(XGBOOST_USE_HIP)
+  sizeof(hipUUID) / sizeof(uint64_t);
+#endif

 void GetCudaUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid, DeviceOrd device) {
+#if defined(XGBOOST_USE_CUDA)
  cudaDeviceProp prob{};
  dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal));
-  std::memcpy(uuid.data(), static_cast<void*>(&(prob.uuid)), sizeof(prob.uuid));
+  std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+#elif defined(XGBOOST_USE_HIP)
+  hipUUID id;
+  hipDeviceGetUuid(&id, device.ordinal);
+  std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
+#endif
 }

 static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid) {
--- a/src/collective/comm.cuh
+++ b/src/collective/comm.cuh
@ -3,8 +3,11 @@
 */
 #pragma once

-#ifdef XGBOOST_USE_NCCL
+#if defined(XGBOOST_USE_NCCL)
 #include "nccl.h"
+#elif defined(XGBOOST_USE_RCCL)
+#include "../common/cuda_to_hip.h"
+#include "rccl.h"
 #endif  // XGBOOST_USE_NCCL

 #include <utility>  // for move
@ -25,7 +28,7 @@ inline Result GetCUDAResult(cudaError rc) {
  return Fail(msg);
 }

-#if defined(XGBOOST_USE_NCCL)
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 class NCCLComm : public Comm {
  ncclComm_t nccl_comm_{nullptr};
  std::shared_ptr<NcclStub> stub_;
--- a/src/collective/communicator.cc
+++ b/src/collective/communicator.cc
@ -54,7 +54,7 @@ void Communicator::Init(Json const& config) {
  }
 }

-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void Communicator::Finalize() {
  communicator_->Shutdown();
  communicator_.reset(new NoOpCommunicator());
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@ -5,7 +5,7 @@
 #include "device_communicator.cuh"
 #include "device_communicator_adapter.cuh"
 #include "noop_communicator.h"
-#ifdef XGBOOST_USE_NCCL
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include "nccl_device_communicator.cuh"
 #endif

@ -28,7 +28,7 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
      communicator_->GetWorldSize() != old_world_size) {
    old_device_ordinal = device_ordinal;
    old_world_size = communicator_->GetWorldSize();
-#ifdef XGBOOST_USE_NCCL
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
    switch (type_) {
      case CommunicatorType::kRabit:
        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@ -98,7 +98,7 @@ class Communicator {
  /** @brief Get the communicator instance. */
  static Communicator *Get() { return communicator_.get(); }

-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  /**
   * @brief Get the device communicator.
   *
@ -235,7 +235,7 @@ class Communicator {
  static thread_local std::unique_ptr<Communicator> communicator_;
  static thread_local CommunicatorType type_;
  static thread_local std::string nccl_path_;
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif

--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@ -74,6 +74,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      Broadcast(host_buffer_.data() + offset, as_bytes, i);
      offset += as_bytes;
    }
+
    dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
                             cudaMemcpyDefault));
  }
--- a/src/collective/nccl_device_communicator.cu
+++ b/src/collective/nccl_device_communicator.cu
@ -1,7 +1,7 @@
 /*!
 * Copyright 2023 XGBoost contributors
 */
-#if defined(XGBOOST_USE_NCCL)
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include <numeric>  // for accumulate

 #include "comm.cuh"
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@ -38,12 +38,22 @@ class NcclDeviceCommunicator : public DeviceCommunicator {

 private:
  static constexpr std::size_t kUuidLength =
+#if defined(XGBOOST_USE_CUDA)
      sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
+#elif defined(XGBOOST_USE_HIP)
+      sizeof(hipUUID) / sizeof(uint64_t);
+#endif

  void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
+#if defined(XGBOOST_USE_CUDA)
    cudaDeviceProp prob{};
    dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
    std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+#elif defined(XGBOOST_USE_HIP)
+    hipUUID id;
+    hipDeviceGetUuid(&id, device_ordinal_);
+    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
+#endif
  }

  static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {
--- a/src/collective/nccl_stub.cc
+++ b/src/collective/nccl_stub.cc
@ -1,15 +1,24 @@
 /**
 * Copyright 2023, XGBoost Contributors
 */
-#if defined(XGBOOST_USE_NCCL)
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include "nccl_stub.h"

+#if defined(XGBOOST_USE_NCCL)
 #include <cuda.h>              // for CUDA_VERSION
 #include <cuda_runtime_api.h>  // for cudaPeekAtLastError
 #include <dlfcn.h>             // for dlclose, dlsym, dlopen
 #include <nccl.h>
 #include <thrust/system/cuda/error.h>  // for cuda_category
 #include <thrust/system_error.h>       // for system_error
+#elif defined(XGBOOST_USE_RCCL)
+#include "../common/cuda_to_hip.h"
+#include <hip/hip_runtime_api.h>  // for cudaPeekAtLastError
+#include <dlfcn.h>             // for dlclose, dlsym, dlopen
+#include <rccl.h>
+#include <thrust/system/hip/error.h>  // for cuda_category
+#include <thrust/system_error.h>       // for system_error
+#endif

 #include <cstdint>  // for int32_t
 #include <sstream>  // for stringstream
@ -39,7 +48,7 @@ Result NcclStub::GetNcclResult(ncclResult_t code) const {
 }

 NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
-#if defined(XGBOOST_USE_DLOPEN_NCCL)
+#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
  CHECK(!path_.empty()) << "Empty path for NCCL.";

  auto cu_major = (CUDA_VERSION) / 1000;
@ -117,7 +126,7 @@ no long bundles NCCL in the binary wheel.
 };

 NcclStub::~NcclStub() {  // NOLINT
-#if defined(XGBOOST_USE_DLOPEN_NCCL)
+#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
  if (handle_) {
    auto rc = dlclose(handle_);
    if (rc != 0) {
--- a/src/collective/nccl_stub.h
+++ b/src/collective/nccl_stub.h
@ -2,9 +2,21 @@
 * Copyright 2023, XGBoost Contributors
 */
 #pragma once
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
+
 #if defined(XGBOOST_USE_NCCL)
 #include <cuda_runtime_api.h>
 #include <nccl.h>
+#elif defined(XGBOOST_USE_RCCL)
+#include "../common/cuda_to_hip.h"
+
+#ifndef THRUST_DEVICE_SYSTEM
+#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP
+#endif
+
+#include <hip/hip_runtime_api.h>
+#include <rccl.h>
+#endif

 #include <string>  // for string

@ -16,7 +28,7 @@ namespace xgboost::collective {
 * @brief A stub for NCCL to facilitate dynamic loading.
 */
 class NcclStub {
-#if defined(XGBOOST_USE_DLOPEN_NCCL)
+#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
  void* handle_{nullptr};
 #endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
  std::string path_;
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@ -10,7 +10,13 @@

 #include <cstddef>             // size_t
 #include <cstdint>             // int32_t
+
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>         // DispatchSegmentedRadixSort,NullType,DoubleBuffer
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
+
 #include <iterator>            // distance
 #include <limits>              // numeric_limits
 #include <type_traits>         // conditional_t,remove_const_t
@ -25,6 +31,7 @@

 namespace xgboost::common {
 namespace detail {
+
 // Wrapper around cub sort to define is_decending
 template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT>
@ -38,6 +45,7 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
  using OffsetT = int;

  // Null value type
+#if defined(XGBOOST_USE_CUDA)
  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
  cub::DoubleBuffer<cub::NullType> d_values;

@ -46,6 +54,20 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
                                    num_segments, d_begin_offsets, d_end_offsets, begin_bit,
                                    end_bit, false, ctx->Stream(), debug_synchronous)));
+#elif defined(XGBOOST_USE_HIP)
+  if (IS_DESCENDING) {
+      rocprim::segmented_radix_sort_pairs_desc<KeyT, cub::NullType, BeginOffsetIteratorT>(d_temp_storage,
+              temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
+              num_segments, d_begin_offsets, d_end_offsets,
+              begin_bit, end_bit, ctx->Stream(), debug_synchronous);
+  }
+  else {
+      rocprim::segmented_radix_sort_pairs<KeyT, cub::NullType, BeginOffsetIteratorT>(d_temp_storage,
+              temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
+              num_segments, d_begin_offsets, d_end_offsets,
+              begin_bit, end_bit, ctx->Stream(), debug_synchronous);
+  }
+#endif
 }

 // Wrapper around cub sort for easier `descending` sort.
@ -59,14 +81,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
                                  int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
+#if defined(XGBOOST_USE_CUDA)
  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
  cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
+#endif
+
  // In old version of cub, num_items in dispatch is also int32_t, no way to change.
  using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
                                     std::size_t, std::int32_t>;
  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
  // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation

+#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
  dh::safe_cuda((cub::DispatchSegmentedRadixSort<
                 descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
@ -87,6 +113,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                                          d_begin_offsets, d_end_offsets, begin_bit,
                                                          end_bit, false, stream, false)));
 #endif
+#elif defined(XGBOOST_USE_HIP)
+  if (descending) {
+      rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+              d_values_in, d_values_out, num_items, num_segments,
+              d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream, false);
+  }
+  else {
+      rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+              d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets,
+              begin_bit, end_bit, stream, false);
+  }
+#endif
 }
 }  // namespace detail

@ -188,6 +226,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
                             });
 }

+#if defined(XGBOOST_USE_CUDA)
 template <bool accending, typename IdxT, typename U>
 void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
             xgboost::common::Span<IdxT> sorted_idx) {
@ -257,5 +296,51 @@ void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
                                sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice,
                                cuctx->Stream()));
 }
+#elif defined(XGBOOST_USE_HIP)
+template <bool accending, typename IdxT, typename U>
+void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
+             xgboost::common::Span<IdxT> sorted_idx) {
+  std::size_t bytes = 0;
+  auto cuctx = ctx->CUDACtx();
+  dh::Iota(sorted_idx, cuctx->Stream());
+
+  using KeyT = typename decltype(keys)::value_type;
+  using ValueT = std::remove_const_t<IdxT>;
+
+  dh::TemporaryArray<KeyT> out(keys.size());
+  dh::TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
+
+  // track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
+  using OffsetT = std::conditional_t<!dh::BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
+  CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
+  if (accending) {
+    void *d_temp_storage = nullptr;
+
+    dh::safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
+                    bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
+                    sizeof(KeyT) * 8, cuctx->Stream(), false)));
+
+    dh::TemporaryArray<char> storage(bytes);
+    d_temp_storage = storage.data().get();
+    dh::safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
+                    bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
+                    sizeof(KeyT) * 8, cuctx->Stream(), false)));
+  } else {
+    void *d_temp_storage = nullptr;
+
+    dh::safe_cuda((rocprim::radix_sort_pairs_desc(d_temp_storage,
+                    bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
+                    sizeof(KeyT) * 8, cuctx->Stream(), false)));
+    dh::TemporaryArray<char> storage(bytes);
+    d_temp_storage = storage.data().get();
+    dh::safe_cuda((rocprim::radix_sort_pairs_desc(d_temp_storage,
+                    bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
+                    sizeof(KeyT) * 8, cuctx->Stream(), false)));
+  }
+
+  dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
+                            sorted_idx.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
+}
+#endif
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_ALGORITHM_CUH_
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@ -16,14 +16,18 @@
 #include <thrust/device_ptr.h>

 #include "device_helpers.cuh"
-#endif  // defined(__CUDACC__)
+#elif defined(__HIPCC__)
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include "device_helpers.hip.h"
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)

 #include "common.h"
 #include "xgboost/span.h"  // for Span

 namespace xgboost {

-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 using BitFieldAtomicType = unsigned long long;  // NOLINT

 __forceinline__ __device__ BitFieldAtomicType AtomicOr(BitFieldAtomicType* address,
@ -47,7 +51,7 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr

  return old;
 }
-#endif  // defined(__CUDACC__)
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)

 /**
 * @brief A non-owning type with auxiliary methods defined for manipulating bits.
@ -105,7 +109,7 @@ struct BitFieldContainer {
  XGBOOST_DEVICE static size_t ComputeStorageSize(index_type size) {
    return common::DivRoundUp(size, kValueSize);
  }
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t min_size = min(NumValues(), rhs.NumValues());
@ -122,9 +126,9 @@ struct BitFieldContainer {
    }
    return *this;
  }
-#endif  // #if defined(__CUDA_ARCH__)
+#endif  // #if defined(__CUDA_ARCH__) || defined(__HIPCC__)

-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  __device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
    size_t min_size = min(NumValues(), rhs.NumValues());
    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@ -143,7 +147,7 @@ struct BitFieldContainer {
  }
 #endif  // defined(__CUDA_ARCH__)

-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  __device__ auto Set(index_type pos) noexcept(true) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
    value_type& value = Data()[pos_v.int_pos];
@ -158,6 +162,16 @@ struct BitFieldContainer {
    using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
    atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
  }
+
+  /* compiler hack */
+#if defined(__HIPCC__)
+  void Clear(index_type pos) noexcept(true) {
+    Pos pos_v = Direction::Shift(ToBitPos(pos));
+    value_type& value = Data()[pos_v.int_pos];
+    value_type clear_bit = ~(kOne << pos_v.bit_pos);
+    value &= clear_bit;
+  }
+#endif
 #else
  void Set(index_type pos) noexcept(true) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
@ -171,7 +185,7 @@ struct BitFieldContainer {
    value_type clear_bit = ~(kOne << pos_v.bit_pos);
    value &= clear_bit;
  }
-#endif  // defined(__CUDA_ARCH__)
+#endif  // defined(__CUDA_ARCH__) || defined(__HIPCC__)

  XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
    pos_v = Direction::Shift(pos_v);
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@ -173,7 +173,7 @@ class ColumnMatrix {
      this->InitView();
    }
    /** @brief Set the i^th element to be a valid element (instead of missing). */
-    void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
+    void SetValid(typename LBitField32::index_type i) {missing.Clear(i);}
    /** @brief assign the storage to the view. */
    void InitView() {
      missing = LBitField32{Span{storage.data(), static_cast<size_t>(storage.size())}};
--- a/src/common/common.cc
+++ b/src/common/common.cc
@ -55,7 +55,7 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
  }
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 int AllVisibleGPUs() { return 0; }
 #endif  // !defined(XGBOOST_USE_CUDA)

--- a/src/common/common.cu
+++ b/src/common/common.cu
@ -2,6 +2,7 @@
 * Copyright 2018-2022 XGBoost contributors
 */
 #include "common.h"
+#include "cuda_to_hip.h"

 namespace xgboost {
 namespace common {
--- a/src/common/common.h
+++ b/src/common/common.h
@ -25,6 +25,13 @@

 #define WITH_CUDA() true

+#elif defined(__HIPCC__)
+#include "cuda_to_hip.h"
+#include <thrust/system/hip/error.h>
+#include <thrust/system_error.h>
+
+#define WITH_CUDA() true
+
 #else

 #define WITH_CUDA() false
@ -32,14 +39,14 @@
 #endif  // defined(__CUDACC__)

 namespace dh {
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 /*
 * Error handling  functions
 */
 #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)

-inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                    int line) {
+inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, int line)
+{
  if (code != cudaSuccess) {
    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
                                       std::string{file} + ": " +  // NOLINT
@ -47,7 +54,7 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
  }
  return code;
 }
-#endif  // defined(__CUDACC__)
+#endif
 }  // namespace dh

 namespace xgboost::common {
@ -170,13 +177,13 @@ class Range {
 int AllVisibleGPUs();

 inline void AssertGPUSupport() {
-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
    LOG(FATAL) << "XGBoost version not compiled with GPU support.";
-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA && XGBOOST_USE_HIP
 }

 inline void AssertNCCLSupport() {
-#if !defined(XGBOOST_USE_NCCL)
+#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
    LOG(FATAL) << "XGBoost version not compiled with NCCL support.";
 #endif  // !defined(XGBOOST_USE_NCCL)
 }
@ -189,7 +196,7 @@ inline void AssertSYCLSupport() {

 void SetDevice(std::int32_t device);

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline void SetDevice(std::int32_t device) {
  if (device >= 0) {
    AssertGPUSupport();
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@ -11,9 +11,11 @@

 #include "common.h"

-#ifdef __CUDACC__
+#if defined(__CUDACC__)
 #include "device_helpers.cuh"
-#endif  // __CUDACC__
+#elif defined(__HIPCC__)
+#include "device_helpers.hip.h"
+#endif  // __CUDACC__ || __HIPCC__

 namespace xgboost {
 namespace common {
@ -105,7 +107,7 @@ class CompressedBufferWriter {
    }
  }

-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
  __device__ void AtomicWriteSymbol
    (CompressedByteT* buffer, uint64_t symbol, size_t offset) {
    size_t ibit_start = offset * symbol_bits_;
@ -119,7 +121,7 @@ class CompressedBufferWriter {
      symbol >>= 8;
    }
  }
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__

  template <typename IterT>
  void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
--- a/src/common/cuda_context.cuh
+++ b/src/common/cuda_context.cuh
@ -4,7 +4,6 @@
 #ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
 #define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
 #include <thrust/execution_policy.h>
-
 #include "device_helpers.cuh"

 namespace xgboost {
--- a/src/common/cuda_pinned_allocator.h
+++ b/src/common/cuda_pinned_allocator.h
@ -76,7 +76,9 @@ class pinned_allocator {
    return result;
  }

-  inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT
+  inline void deallocate(pointer p, size_type) {
+      dh::safe_cuda(cudaFreeHost(p));
+  } // NOLINT

  inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // NOLINT

--- a/src/common/cuda_to_hip.h
+++ b/src/common/cuda_to_hip.h
@ -0,0 +1,85 @@
+/**
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#pragma once
+
+#if defined(XGBOOST_USE_HIP)
+
+#define cudaSuccess                                         hipSuccess
+#define cudaError                                           hipError_t
+#define cudaError_t                                         hipError_t
+#define cudaGetLastError                                    hipGetLastError
+#define cudaPeekAtLastError                                 hipPeekAtLastError
+#define cudaErrorInvalidValue                               hipErrorInvalidValue
+
+#define cudaStream_t                                        hipStream_t
+#define cudaStreamCreate                                    hipStreamCreate
+#define cudaStreamCreateWithFlags                           hipStreamCreateWithFlags
+#define cudaStreamDestroy                                   hipStreamDestroy
+#define cudaStreamWaitEvent                                 hipStreamWaitEvent
+#define cudaStreamSynchronize                               hipStreamSynchronize
+#define cudaStreamPerThread                                 hipStreamPerThread
+
+/* not compatible */
+#define cudaStreamLegacy                                    hipStreamDefault
+#define hipStreamLegacy                                     hipStreamDefault
+
+#define cudaEvent_t                                         hipEvent_t
+#define cudaEventCreate                                     hipEventCreate
+#define cudaEventCreateWithFlags                            hipEventCreateWithFlags
+#define cudaEventDestroy                                    hipEventDestroy
+
+#define cudaGetDevice                                       hipGetDevice
+#define cudaSetDevice                                       hipSetDevice
+#define cudaGetDeviceCount                                  hipGetDeviceCount
+#define cudaDeviceSynchronize                               hipDeviceSynchronize
+
+#define cudaGetDeviceProperties                             hipGetDeviceProperties
+#define cudaDeviceGetAttribute                              hipDeviceGetAttribute
+
+#define cudaMallocHost                                      hipHostMalloc
+#define cudaFreeHost                                        hipHostFree
+#define cudaMalloc                                          hipMalloc
+#define cudaFree                                            hipFree
+
+#define cudaMemcpy                                          hipMemcpy
+#define cudaMemcpyAsync                                     hipMemcpyAsync
+#define cudaMemcpyDefault                                   hipMemcpyDefault
+#define cudaMemcpyHostToDevice                              hipMemcpyHostToDevice
+#define cudaMemcpyHostToHost                                hipMemcpyHostToHost
+#define cudaMemcpyDeviceToHost                              hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice                            hipMemcpyDeviceToDevice
+#define cudaMemsetAsync                                     hipMemsetAsync
+#define cudaMemset                                          hipMemset
+
+#define cudaPointerAttributes                               hipPointerAttribute_t 
+#define cudaPointerGetAttributes                            hipPointerGetAttributes
+
+/* hipMemoryTypeUnregistered not supported */
+#define cudaMemoryTypeUnregistered                          hipMemoryTypeUnified
+#define cudaMemoryTypeUnified                               hipMemoryTypeUnified
+#define cudaMemoryTypeHost                                  hipMemoryTypeHost
+
+#define cudaMemGetInfo                                      hipMemGetInfo
+#define cudaFuncSetAttribute                                hipFuncSetAttribute
+
+#define cudaDevAttrMultiProcessorCount                      hipDeviceAttributeMultiprocessorCount
+#define cudaOccupancyMaxActiveBlocksPerMultiprocessor       hipOccupancyMaxActiveBlocksPerMultiprocessor
+
+namespace thrust {
+    namespace hip {
+    }
+
+    namespace cuda = thrust::hip;
+}
+
+namespace thrust {
+#define cuda_category hip_category
+}
+
+namespace hipcub {
+}
+
+namespace cub = hipcub;
+
+#endif
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@ -2,6 +2,8 @@
 * Copyright 2017-2023 XGBoost contributors
 */
 #pragma once
+
+#if defined(XGBOOST_USE_CUDA)
 #include <thrust/binary_search.h>  // thrust::upper_bound
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_ptr.h>
@ -1138,3 +1140,7 @@ class LDGIterator {
  }
 };
 }  // namespace dh
+
+#elif defined(XGBOOST_USE_HIP)
+#include "device_helpers.hip.h"
+#endif
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
--- a/src/common/gpu_error_check.h
+++ b/src/common/gpu_error_check.h
@ -0,0 +1,15 @@
+#pragma once
+#include <iostream>
+#include <hip/hip_runtime.h>
+
+#define GPU_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(hipError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != hipSuccess) 
+   {
+      std::cerr << "GPU Error: " << hipGetErrorString(code) << " " << file << " " << line << std::endl;
+      if (abort) exit(code);
+   }
+}
+
+#define GPU_CHECK_LAST() { gpuAssert(hipGetLastError(), __FILE__, __LINE__); }
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@ -171,6 +171,7 @@ void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cu
      column_sizes_scan->begin(), [=] __device__(size_t column_size) {
        return thrust::min(num_cuts_per_feature, column_size);
      });
+
  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
@ -296,6 +297,7 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                                 &column_sizes_scan,
                                 &sorted_entries);
  dh::XGBDeviceAllocator<char> alloc;
+
  thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
               sorted_entries.end(), detail::EntryCompareOp());

@ -355,11 +357,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
          bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
          return weights[group_idx];
        });
+
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
+
    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  } else {
    CHECK_EQ(batch.NumRows(), weights.size());
@ -368,11 +372,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
        [=]__device__(size_t idx) -> float {
          return weights[batch.GetElement(idx).row_idx];
        });
+
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
+
    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  }

--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@ -1,7 +1,7 @@
 /**
 * Copyright 2017-2023 by XGBoost contributors
 */
-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 // dummy implementation of HostDeviceVector in case CUDA is not used

@ -197,4 +197,4 @@ template class HostDeviceVector<std::size_t>;

 }  // namespace xgboost

-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA && !defined(XGBOOST_USE_HIP)
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@ -195,6 +195,7 @@ class HostDeviceVectorImpl {
    gpu_access_ = access;
    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
    SetDevice();
+
    dh::safe_cuda(cudaMemcpy(data_h_.data(),
                             data_d_->data().get(),
                             data_d_->size() * sizeof(T),
@ -211,6 +212,7 @@ class HostDeviceVectorImpl {
    // data is on the host
    LazyResizeDevice(data_h_.size());
    SetDevice();
+
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                  data_h_.data(),
                                  data_d_->size() * sizeof(T),
@ -239,6 +241,7 @@ class HostDeviceVectorImpl {
      LazyResizeDevice(Size());
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
+
      dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
                                    data_d_->size() * sizeof(T), cudaMemcpyDefault));
    }
@ -248,6 +251,7 @@ class HostDeviceVectorImpl {
    LazyResizeDevice(Size());
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
+
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
  }
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@ -48,7 +48,7 @@ void ElementWiseKernelHost(linalg::TensorView<T, D> t, std::int32_t n_threads, F
  }
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 template <typename T, int32_t D, typename Fn>
 void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
  common::AssertGPUSupport();
@ -66,7 +66,7 @@ void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn)
  }
  ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 template <typename T, std::int32_t kDim>
 auto cbegin(TensorView<T, kDim> const& v) {  // NOLINT
--- a/src/common/math.h
+++ b/src/common/math.h
@ -143,7 +143,7 @@ CheckNAN(T) {
  return false;
 }

-#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
+#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__) && !defined(__HIPCC__)

 bool CheckNAN(double v);

@ -152,23 +152,27 @@ bool CheckNAN(double v);
 XGBOOST_DEVICE bool inline CheckNAN(float x) {
 #if defined(__CUDA_ARCH__)
  return isnan(x);
+#elif defined(__HIPCC__)
+  return __builtin_isnan(x);
 #else
  return std::isnan(x);
-#endif  // defined(__CUDA_ARCH__)
+#endif  // defined(__CUDA_ARCH__) || defined(__HIPCC__)
 }

 XGBOOST_DEVICE bool inline CheckNAN(double x) {
 #if defined(__CUDA_ARCH__)
  return isnan(x);
+#elif defined(__HIPCC__)
+  return __builtin_isnan(x);
 #else
  return std::isnan(x);
-#endif  // defined(__CUDA_ARCH__)
+#endif  // defined(__CUDA_ARCH__) || defined(__HIPCC__)
 }

 #endif  // XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
 // GPU version is not uploaded in CRAN anyway.
 // Specialize only when using R with CPU.
-#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA)
+#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 double LogGamma(double v);

 #else  // Not R or R with GPU.
@ -191,7 +195,7 @@ XGBOOST_DEVICE inline T LogGamma(T v) {
 #endif  // _MSC_VER
 }

-#endif  // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA)
+#endif  // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 }  // namespace common
 }  // namespace xgboost
--- a/src/common/numeric.cu
+++ b/src/common/numeric.cu
@ -13,6 +13,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
  values.SetDevice(ctx->Device());
  auto const d_values = values.ConstDeviceSpan();
  dh::XGBCachingDeviceAllocator<char> alloc;
+
  return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                    thrust::plus<float>{});
 }
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@ -99,12 +99,12 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {

 namespace cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline double Reduce(Context const*, HostDeviceVector<float> const&) {
  AssertGPUSupport();
  return 0;
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace cuda_impl

 /**
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@ -107,6 +107,7 @@ template <typename T, typename U>
 void CopyTo(Span<T> out, Span<U> src) {
  CHECK_EQ(out.size(), src.size());
  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
+
  dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                out.size_bytes(),
                                cudaMemcpyDefault));
@ -160,6 +161,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
  // Compute output ptr
  auto transform_it =
      thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
+
  thrust::transform(
      thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
      out_ptr.data(),
@ -376,6 +378,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
      });
  // Reverse scan to accumulate weights into first duplicated element on left.
  auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
+
  thrust::inclusive_scan_by_key(
      thrust::cuda::par(alloc), key_it, key_it + entries.size(),
      val_it, val_it,
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@ -184,11 +184,13 @@ class SketchContainer {
    dh::XGBCachingDeviceAllocator<char> alloc;

    d_column_scan = this->columns_ptr_.DeviceSpan();
+
    size_t n_uniques = dh::SegmentedUnique(
        thrust::cuda::par(alloc), d_column_scan.data(),
        d_column_scan.data() + d_column_scan.size(), entries.data(),
        entries.data() + entries.size(), scan_out.DevicePointer(),
        entries.data(), detail::SketchUnique{}, key_comp);
+
    this->columns_ptr_.Copy(scan_out);
    CHECK(!this->columns_ptr_.HostCanRead());

--- a/src/common/random.cc
+++ b/src/common/random.cc
@ -19,7 +19,7 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
  auto p_new_features = std::make_shared<HostDeviceVector<bst_feature_t>>();

  if (ctx_->IsCUDA()) {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
    cuda_impl::SampleFeature(ctx_, n, p_features, p_new_features, this->feature_weights_,
                             &this->weight_buffer_, &this->idx_buffer_, &rng_);
    return p_new_features;
--- a/src/common/random.h
+++ b/src/common/random.h
@ -180,7 +180,7 @@ class ColumnSampler {
    if (ctx->IsCPU()) {
      std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0);
    } else {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
      cuda_impl::InitFeatureSet(ctx, feature_set_tree_);
 #else
      AssertGPUSupport();
--- a/src/common/ranking_utils.cc
+++ b/src/common/ranking_utils.cc
@ -62,7 +62,7 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
  return rank;
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
 common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
                                                             common::Span<float const>) {
@ -108,9 +108,9 @@ void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
  });
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 DMLC_REGISTER_PARAMETER(LambdaRankParam);

@ -120,7 +120,7 @@ void PreCache::InitOnCPU(Context const*, MetaInfo const& info) {
                 [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
 #endif  // !defined(XGBOOST_USE_CUDA)

@ -130,9 +130,9 @@ void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
                 [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
  std::string out_name;
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@ -23,6 +23,10 @@
 #include "xgboost/logging.h"                    // for CHECK
 #include "xgboost/span.h"                       // for Span

+#if defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
+
 namespace xgboost::ltr {
 namespace cuda_impl {
 void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@ -216,6 +216,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
                                                         detail::SegOp<SegIt>{seg_beg, seg_end});
  auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                   detail::WeightOp<WIter>{w_begin, d_sorted_idx});
+
  thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                scan_val, weights_cdf.begin());

--- a/src/common/stats.h
+++ b/src/common/stats.h
@ -112,7 +112,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWe

 void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights,
                   linalg::Tensor<float, 1>*) {
  common::AssertGPUSupport();
@ -120,7 +120,7 @@ inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalW
 inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
  common::AssertGPUSupport();
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace cuda_impl

 /**
--- a/src/common/survival_util.h
+++ b/src/common/survival_util.h
@ -25,12 +25,12 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::common::ProbabilityDistributionType);
 namespace xgboost {
 namespace common {

-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)

 using std::log;
 using std::fmax;

-#endif  // __CUDACC__
+#endif  // __CUDACC__ && __HIPCC__

 enum class CensoringType : uint8_t {
  kUncensored, kRightCensored, kLeftCensored, kIntervalCensored
--- a/src/common/threading_utils.cuh
+++ b/src/common/threading_utils.cuh
@ -59,9 +59,11 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
  });
  dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
                   out_group_threads_ptr.size());
-  std::size_t total = 0;
+  size_t total = 0;
+
  dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
-                           sizeof(total), cudaMemcpyDeviceToHost));
+      sizeof(total), cudaMemcpyDeviceToHost));
+
  return total;
 }

--- a/src/common/transform.h
+++ b/src/common/transform.h
@ -19,7 +19,9 @@

 #if defined (__CUDACC__)
 #include "device_helpers.cuh"
-#endif  // defined (__CUDACC__)
+#elif defined(__HIPCC__)
+#include "device_helpers.hip.h"
+#endif  // defined (__CUDACC__) || defined(__HIPCC__)

 namespace xgboost {
 namespace common {
@ -28,7 +30,7 @@ constexpr size_t kBlockThreads = 256;

 namespace detail {

-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 template <typename Functor, typename... SpanType>
 __global__ void LaunchCUDAKernel(Functor _func, Range _range,
                                 SpanType... _spans) {
@ -36,7 +38,7 @@ __global__ void LaunchCUDAKernel(Functor _func, Range _range,
    _func(i, _spans...);
  }
 }
-#endif  // defined(__CUDACC__)
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)

 }  // namespace detail

@ -127,7 +129,7 @@ class Transform {
      UnpackShard(device, _vectors...);
    }

-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
    template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
              typename... HDV>
    void LaunchCUDA(Functor _func, HDV*... _vectors) const {
@ -159,7 +161,7 @@ class Transform {

      LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA();
    }
-#endif  // defined(__CUDACC__)
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)

    template <typename... HDV>
    void LaunchCPU(Functor func, HDV *...vectors) const {
--- a/src/context.cc
+++ b/src/context.cc
@ -27,7 +27,7 @@ Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
 namespace {
 inline constexpr char const* kDevice = "device";

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
  device = DeviceOrd::CPU();
  return device;
@ -271,10 +271,10 @@ std::int32_t Context::Threads() const {
  return n_threads;
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 CUDAContext const* Context::CUDACtx() const {
  common::AssertGPUSupport();
  return nullptr;
 }
-#endif  // defined(XGBOOST_USE_CUDA)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }  // namespace xgboost
--- a/src/data/array_interface.cc
+++ b/src/data/array_interface.cc
@ -6,7 +6,7 @@
 #include "../common/common.h"  // for AssertGPUSupport

 namespace xgboost {
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
 bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
 #endif  // !defined(XGBOOST_USE_CUDA)
--- a/src/data/array_interface.cu
+++ b/src/data/array_interface.cu
@ -1,31 +1,30 @@
 /** 
 * Copyright 2021-2023, XGBoost Contributors 
 */
-#include <cstdint>  // for int64_t

+#include <cstdint> // for int64_t
 #include "../common/common.h"
-#include "../common/device_helpers.cuh"  // for DefaultStream, CUDAEvent
+#include "../common/device_helpers.cuh" // for DefaultStream, CUDAEvent
 #include "array_interface.h"
 #include "xgboost/logging.h"

 namespace xgboost {
+
 void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
  switch (stream) {
    case 0:
      /**
-       * disallowed by the `__cuda_array_interface__`.  Quote:
+       * disallowed by the *`__cuda_array_interface__`*. Quote:
       *
-       *   This is disallowed as it would be ambiguous between None and the default
-       *   stream, and also between the legacy and per-thread default streams. Any use
-       *   case where 0 might be given should either use None, 1, or 2 instead for
-       *   clarity.
+       * This is disallowed as it would be ambiguous between None and the default
+       * stream, and also between the legacy and per-thread default streams. Any use
+       * case where 0 might be given should either use None, 1, or 2 instead for
+       * clarity.
       */
      LOG(FATAL) << "Invalid stream ID in array interface: " << stream;
-    case 1:
-      // default legacy stream
+    case 1: // default legacy stream
      break;
-    case 2:
-      // default per-thread stream
+    case 2: // default per-thread stream
    default: {
      dh::CUDAEvent e;
      e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
@ -38,6 +37,35 @@ bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
  if (!ptr) {
    return false;
  }
+
+  // Check if the pointer is within the process's address space
+  uintptr_t ptr_value = reinterpret_cast<uintptr_t>(ptr);
+  uintptr_t process_max_addr = (uintptr_t)-1;
+
+#if defined(XGBOOST_USE_HIP)
+  hipPointerAttribute_t attr;
+  auto err = hipPointerGetAttributes(&attr, ptr);
+  
+
+  if (err == hipErrorInvalidValue) {
+    return false;
+  } else if (err == hipSuccess) {
+    switch (attr.type) {
+      case hipMemoryTypeUnregistered:
+        return false;
+      case hipMemoryTypeHost:
+        return false;
+      case hipMemoryTypeDevice:
+        return true;
+      case hipMemoryTypeManaged:
+        return true;
+      default:
+        return false;
+    }
+  } else {
+    return false;
+  }
+#elif defined(XGBOOST_USE_CUDA)
  cudaPointerAttributes attr;
  auto err = cudaPointerGetAttributes(&attr, ptr);
  // reset error
@ -59,5 +87,9 @@ bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
    // other errors, `cudaErrorNoDevice`, `cudaErrorInsufficientDriver` etc.
    return false;
  }
+#else
+  return false;
+#endif
 }
+
 }  // namespace xgboost
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@ -28,6 +28,8 @@

 #if defined(XGBOOST_USE_CUDA)
 #include "cuda_fp16.h"
+#elif defined(XGBOOST_USE_HIP)
+#include <hip/hip_fp16.h>
 #endif

 namespace xgboost {
@ -321,7 +323,7 @@ class ArrayInterfaceHandler {
 template <typename T, typename E = void>
 struct ToDType;
 // float
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 template <>
 struct ToDType<__half> {
  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
@ -471,7 +473,7 @@ class ArrayInterface {
      CHECK(sizeof(long double) == 16) << error::NoF128();
      type = T::kF16;
    } else if (typestr[1] == 'f' && typestr[2] == '2') {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
      type = T::kF2;
 #else
      LOG(FATAL) << "Half type is not supported.";
@ -510,7 +512,7 @@ class ArrayInterface {
    using T = ArrayInterfaceHandler::Type;
    switch (type) {
      case T::kF2: {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
        return func(reinterpret_cast<__half const *>(data));
 #endif  // defined(XGBOOST_USE_CUDA)
      }
@ -518,7 +520,7 @@ class ArrayInterface {
        return func(reinterpret_cast<float const *>(data));
      case T::kF8:
        return func(reinterpret_cast<double const *>(data));
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__ ) || defined(__HIPCC__)
      case T::kF16: {
        // CUDA device code doesn't support long double.
        SPAN_CHECK(false);
@ -565,7 +567,7 @@ class ArrayInterface {
    static_assert(sizeof...(index) <= D, "Invalid index.");
    return this->DispatchCall([=](auto const *p_values) -> T {
      std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
      // No operator defined for half -> size_t
      using Type = std::conditional_t<
          std::is_same<__half,
@ -599,7 +601,7 @@ template <typename Fn>
 auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) {
  switch (dtype) {
    case ArrayInterfaceHandler::kF2: {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
      return dispatch(__half{});
 #else
      LOG(FATAL) << "half type is only supported for CUDA input.";
--- a/src/data/data.cc
+++ b/src/data/data.cc
@ -799,9 +799,9 @@ void MetaInfo::Validate(DeviceOrd device) const {
  }
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void MetaInfo::SetInfoFromCUDA(Context const&, StringView, Json) { common::AssertGPUSupport(); }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 bool MetaInfo::IsVerticalFederated() const {
  return collective::IsFederated() && IsColumnSplit();
--- a/src/data/data.cu
+++ b/src/data/data.cu
@ -44,6 +44,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
      std::copy(array.shape, array.shape + D, shape.data());
      // set data
      data->Resize(array.n);
+
      dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
                                    cudaMemcpyDefault, ctx->Stream()));
    });
@ -95,8 +96,10 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
    }
  });
  bool non_dec = true;
+
  dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
                           cudaMemcpyDeviceToHost));
+
  CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
  size_t bytes = 0;
  dh::caching_device_vector<uint32_t> out(array_interface.Shape(0));
@ -114,8 +117,10 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
  group_ptr_.clear();
  group_ptr_.resize(h_num_runs_out + 1, 0);
  dh::XGBCachingDeviceAllocator<char> alloc;
+
  thrust::inclusive_scan(thrust::cuda::par(alloc), cnt.begin(),
                         cnt.begin() + h_num_runs_out, cnt.begin());
+
  thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out,
               group_ptr_.begin() + 1);
 }
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@ -244,6 +244,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
                  &offset[ridx]),
              static_cast<unsigned long long>(cnt));  // NOLINT
  });
+
  dh::XGBCachingDeviceAllocator<char> alloc;
  bst_idx_t row_stride =
      dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@ -1,7 +1,7 @@
 /**
 * Copyright 2019-2023, XGBoost contributors
 */
-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 #include "ellpack_page.h"

@ -52,5 +52,4 @@ size_t EllpackPage::Size() const {
  return impl_->Cuts();
 }
 }  // namespace xgboost
-
-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@ -18,6 +18,10 @@
 #include "gradient_index.h"
 #include "xgboost/data.h"

+#if defined(XGBOOST_USE_HIP)
+#include <rocprim/rocprim.hpp>
+#endif
+
 namespace xgboost {

 EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {}
@ -251,6 +255,8 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
  // Go one level down into cub::DeviceScan API to set OffsetT as 64 bit
  // So we don't crash on n > 2^31
  size_t temp_storage_bytes = 0;
+
+#if defined(__CUDACC__)
  using DispatchScan =
      cub::DispatchScan<decltype(key_value_index_iter), decltype(out),
                        TupleScanOp<Tuple>, cub::NullType, int64_t>;
@ -273,6 +279,17 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
                         key_value_index_iter, out, TupleScanOp<Tuple>(),
                         cub::NullType(), batch.Size(), nullptr, false);
 #endif
+
+#elif defined (__HIPCC__)
+
+  rocprim::inclusive_scan(nullptr, temp_storage_bytes, key_value_index_iter, out, batch.Size(), TupleScanOp<Tuple>());
+
+  dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
+
+  rocprim::inclusive_scan(temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, batch.Size(),
+       TupleScanOp<Tuple>());
+
+#endif
 }

 void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
@ -526,11 +543,13 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
    // copy data entries to device.
    if (row_batch.data.DeviceCanRead()) {
      auto const& d_data = row_batch.data.ConstDeviceSpan();
+
      dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), d_data.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
    } else {
      const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
+
      dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@ -45,7 +45,7 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
  void Fetch() final;
 };

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline void EllpackPageSource::Fetch() {
  // silent the warning about unused variables.
  (void)(row_stride_);
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@ -65,12 +65,12 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
      max_numeric_bins_per_feat(max_bin_per_feat),
      isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 GHistIndexMatrix::GHistIndexMatrix(Context const *, MetaInfo const &, EllpackPage const &,
                                   BatchParam const &) {
  common::AssertGPUSupport();
 }
-#endif  // defined(XGBOOST_USE_CUDA)
+#endif  // defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 GHistIndexMatrix::~GHistIndexMatrix() = default;

--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@ -356,7 +356,7 @@ BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(Context const* ctx,
  return BatchSet<ExtSparsePage>(begin_iter);
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, DataIterHandle, float,
                                           std::shared_ptr<DMatrix>) {
  // silent the warning about unused variables.
@ -376,5 +376,5 @@ inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const*,
 inline void GetCutsFromEllpack(EllpackPage const&, common::HistogramCuts*) {
  common::AssertGPUSupport();
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace xgboost::data
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@ -46,6 +46,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  bst_feature_t cols = 0;

  int32_t current_device;
+
  dh::safe_cuda(cudaGetDevice(&current_device));
  auto get_device = [&]() {
    auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
@ -84,7 +85,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
                            return GetRowCounts(value, row_counts_span, get_device(), missing);
                          }));
+
    nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
+
    batches++;
  } while (iter.Next());
  iter.Reset();
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@ -43,7 +43,7 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char
 namespace cuda_impl {
 std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
                                                std::shared_ptr<DMatrixProxy> proxy, float missing);
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
                                                float) {
  return nullptr;
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@ -41,10 +41,10 @@ class DMatrixProxy : public DMatrix {
  std::any batch_;
  Context ctx_;

-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  void FromCudaColumnar(StringView interface_str);
  void FromCudaArray(StringView interface_str);
-#endif  // defined(XGBOOST_USE_CUDA)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

 public:
  DeviceOrd Device() const { return ctx_.Device(); }
@ -52,7 +52,7 @@ class DMatrixProxy : public DMatrix {
  void SetCUDAArray(char const* c_interface) {
    common::AssertGPUSupport();
    CHECK(c_interface);
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
    StringView interface_str{c_interface};
    Json json_array_interface = Json::Load(interface_str);
    if (IsA<Array>(json_array_interface)) {
@ -60,7 +60,7 @@ class DMatrixProxy : public DMatrix {
    } else {
      this->FromCudaArray(interface_str);
    }
-#endif  // defined(XGBOOST_USE_CUDA)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  }

  void SetColumnarData(StringView interface_str);
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@ -15,14 +15,28 @@

 namespace xgboost::data {

+#if defined(XGBOOST_USE_CUDA)
 template <typename AdapterBatchT>
 struct COOToEntryOp {
  AdapterBatchT batch;
+
  __device__ Entry operator()(size_t idx) {
    const auto& e = batch.GetElement(idx);
    return Entry(e.column_idx, e.value);
  }
 };
+#elif defined(XGBOOST_USE_HIP)
+template <typename AdapterBatchT>
+struct COOToEntryOp : thrust::unary_function<size_t, Entry> {
+  AdapterBatchT batch;
+  COOToEntryOp(AdapterBatchT batch): batch(batch) {};
+
+  __device__ Entry operator()(size_t idx) {
+    const auto& e = batch.GetElement(idx);
+    return Entry(e.column_idx, e.value);
+  }
+};
+#endif

 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@ -19,7 +19,7 @@ const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
 namespace detail {
 // Use device dispatch
 std::size_t NSamplesDevice(DMatrixProxy *)  // NOLINT
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 ;  // NOLINT
 #else
 {
@ -28,7 +28,7 @@ std::size_t NSamplesDevice(DMatrixProxy *)  // NOLINT
 }
 #endif
 std::size_t NFeaturesDevice(DMatrixProxy *)  // NOLINT
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 ;  // NOLINT
 #else
 {
@ -194,11 +194,11 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ct
  return BatchSet<GHistIndexMatrix>(BatchIterator<GHistIndexMatrix>(begin_iter));
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const *, const BatchParam &) {
  common::AssertGPUSupport();
  auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace xgboost::data
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+}  // namespace data
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@ -295,7 +295,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
  }
 };

-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 // Push data from CUDA.
 void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
 #else
--- a/src/data/validation.h
+++ b/src/data/validation.h
@ -13,7 +13,7 @@ namespace xgboost {
 namespace data {
 struct LabelsCheck {
  XGBOOST_DEVICE bool operator()(float y) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
    return ::isnan(y) || ::isinf(y);
 #else
    return std::isnan(y) || std::isinf(y);
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@ -103,7 +103,7 @@ void GBTree::Configure(Args const& cfg) {
    cpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", this->ctx_));
  }
  cpu_predictor_->Configure(cfg);
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  auto n_gpus = common::AllVisibleGPUs();
  if (!gpu_predictor_) {
    gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
@ -111,7 +111,7 @@ void GBTree::Configure(Args const& cfg) {
  if (n_gpus != 0) {
    gpu_predictor_->Configure(cfg);
  }
-#endif  // defined(XGBOOST_USE_CUDA)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

 #if defined(XGBOOST_USE_SYCL)
  if (!sycl_predictor_) {
@ -150,7 +150,7 @@ void GBTree::Configure(Args const& cfg) {

 void GPUCopyGradient(Context const*, linalg::Matrix<GradientPair> const*, bst_group_t,
                     linalg::Matrix<GradientPair>*)
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
    ;  // NOLINT
 #else
 {
@ -640,7 +640,7 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
 */
 void GPUDartPredictInc(common::Span<float>, common::Span<float>, float, size_t, bst_group_t,
                       bst_group_t)
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
    ;  // NOLINT
 #else
 {
@ -652,7 +652,7 @@ void GPUDartInplacePredictInc(common::Span<float> /*out_predts*/, common::Span<f
                              float /*tree_w*/, size_t /*n_rows*/,
                              linalg::TensorView<float const, 1> /*base_score*/,
                              bst_group_t /*n_groups*/, bst_group_t /*group*/)
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
    ;  // NOLINT
 #else
 {
--- a/src/linear/linear_updater.cc
+++ b/src/linear/linear_updater.cc
@ -30,8 +30,8 @@ DMLC_REGISTER_PARAMETER(LinearTrainParam);
 // List of files that will be force linked in static links.
 DMLC_REGISTRY_LINK_TAG(updater_shotgun);
 DMLC_REGISTRY_LINK_TAG(updater_coordinate);
-#ifdef XGBOOST_USE_CUDA
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 DMLC_REGISTRY_LINK_TAG(updater_gpu_coordinate);
-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA, XGBOOST_USE_HIP
 }  // namespace linear
 }  // namespace xgboost
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@ -86,6 +86,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
    for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
      auto col = page[fidx];
      auto seg = column_segments[fidx];
+
      dh::safe_cuda(cudaMemcpy(
          data_.data().get() + row_ptr_[fidx],
          col.data() + seg.first,
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@ -377,7 +377,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
 .describe("Receiver Operating Characteristic Area Under the Curve.")
 .set_body([](const char*) { return new EvalROCAUC(); });

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 std::tuple<double, double, double> GPUBinaryROCAUC(Context const *, common::Span<float const>,
                                                   MetaInfo const &,
                                                   std::shared_ptr<DeviceAUCCache> *) {
@ -398,7 +398,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *, common::Span<flo
  return {};
 }
 struct DeviceAUCCache {};
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 class EvalPRAUC : public EvalAUC<EvalPRAUC> {
  std::shared_ptr<DeviceAUCCache> d_cache_;
@ -454,7 +454,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
    .describe("Area under PR curve for both classification and rank.")
    .set_body([](char const *) { return new EvalPRAUC{}; });

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 std::tuple<double, double, double> GPUBinaryPRAUC(Context const *, common::Span<float const>,
                                                  MetaInfo const &,
                                                  std::shared_ptr<DeviceAUCCache> *) {
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@ -5,7 +5,13 @@

 #include <algorithm>
 #include <cassert>
+
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>  // NOLINT
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>  // NOLINT
+#endif
+
 #include <limits>
 #include <memory>
 #include <tuple>
@ -121,10 +127,12 @@ std::tuple<double, double, double> GPUBinaryAUC(Context const *ctx,
  auto uni_key = dh::MakeTransformIterator<float>(
      thrust::make_counting_iterator(0),
      [=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
+
  auto end_unique = thrust::unique_by_key_copy(
      thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
      dh::tbegin(d_unique_idx));
+
  d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));

  dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp),
@ -164,7 +172,9 @@ std::tuple<double, double, double> GPUBinaryAUC(Context const *ctx,
      });

  Pair last = cache->fptp.back();
+
  double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
+
  return std::make_tuple(last.first, last.second, auc);
 }

@ -219,6 +229,7 @@ double ScaleClasses(Context const *ctx, common::Span<double> results,

  double tp_sum;
  double auc_sum;
+
  thrust::tie(auc_sum, tp_sum) =
      thrust::reduce(ctx->CUDACtx()->CTP(), reduce_in, reduce_in + n_classes, Pair{0.0, 0.0},
                     PairPlus<double, double>{});
@ -301,6 +312,7 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
        double auc = area_fn(fp_prev, fp, tp_prev, tp, class_id);
        return auc;
      });
+
  thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
                        key_in + d_unique_idx.size(), val_in,
                        thrust::make_discard_iterator(), dh::tbegin(d_auc));
@ -374,6 +386,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
  // unique values are sparse, so we need a CSR style indptr
  dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
  auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
+
  auto n_uniques = dh::SegmentedUniqueByKey(
      thrust::cuda::par(alloc),
      dh::tbegin(d_class_ptr),
@ -384,6 +397,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
      d_unique_class_ptr.data(),
      dh::tbegin(d_unique_idx),
      thrust::equal_to<thrust::pair<uint32_t, float>>{});
+
  d_unique_idx = d_unique_idx.subspan(0, n_uniques);

  auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; };
@ -501,9 +515,11 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
  auto check_it = dh::MakeTransformIterator<size_t>(
      thrust::make_counting_iterator(0),
      [=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
+
  size_t n_valid = thrust::count_if(
      thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
+
  if (n_valid < info.group_ptr_.size() - 1) {
    InvalidGroupAUC();
  }
@ -602,6 +618,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
   */
  double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
                              dh::tend(s_d_auc), 0.0);
+
  return std::make_pair(auc, n_valid);
 }

@ -629,6 +646,7 @@ std::tuple<double, double, double> GPUBinaryPRAUC(Context const *ctx,
      });
  dh::XGBCachingDeviceAllocator<char> alloc;
  double total_pos, total_neg;
+
  thrust::tie(total_pos, total_neg) =
      thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
                     Pair{0.0, 0.0}, PairPlus<double, double>{});
@ -683,6 +701,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
        return thrust::make_pair(y * w, (1.0f - y) * w);
      });
  dh::XGBCachingDeviceAllocator<char> alloc;
+
  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                        key_it + predts.size(), val_it,
                        thrust::make_discard_iterator(), totals.begin(),
@ -755,6 +774,7 @@ std::pair<double, uint32_t> GPURankingPRAUCImpl(Context const *ctx,
  // unique values are sparse, so we need a CSR style indptr
  dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
  auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
+
  auto n_uniques = dh::SegmentedUniqueByKey(
      thrust::cuda::par(alloc),
      dh::tbegin(d_group_ptr),
@ -765,6 +785,7 @@ std::pair<double, uint32_t> GPURankingPRAUCImpl(Context const *ctx,
      d_unique_class_ptr.data(),
      dh::tbegin(d_unique_idx),
      thrust::equal_to<thrust::pair<uint32_t, float>>{});
+
  d_unique_idx = d_unique_idx.subspan(0, n_uniques);

  auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) {
@ -815,6 +836,7 @@ std::pair<double, uint32_t> GPURankingPRAUCImpl(Context const *ctx,
          }
          return thrust::make_pair(0.0, static_cast<uint32_t>(1));
        });
+
    thrust::tie(auc, invalid_groups) = thrust::reduce(
        thrust::cuda::par(alloc), it, it + n_groups,
        thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
@ -848,10 +870,12 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,

  dh::XGBDeviceAllocator<char> alloc;
  auto labels = info.labels.View(ctx->Device());
+
  if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                     dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
    InvalidLabels();
  }
+
  /**
   * Get total positive/negative for each group.
   */
@ -871,6 +895,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
        auto y = labels(i);
        return thrust::make_pair(y * w, (1.0 - y) * w);
      });
+
  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                        key_it + predts.size(), val_it,
                        thrust::make_discard_iterator(), totals.begin(),
--- a/src/metric/elementwise_metric.cc
+++ b/src/metric/elementwise_metric.cc
@ -3,6 +3,6 @@
 */
 // Dummy file to keep the CUDA conditional compile trick.

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 #include "elementwise_metric.cu"
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@ -22,14 +22,14 @@
 #include "xgboost/collective/result.h"  // for SafeColl
 #include "xgboost/metric.h"

-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 #include <thrust/execution_policy.h>  // thrust::cuda::par
 #include <thrust/functional.h>        // thrust::plus<>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform_reduce.h>

 #include "../common/device_helpers.cuh"
-#endif  // XGBOOST_USE_CUDA
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

 namespace xgboost::metric {
 // tag the this file, used by force static link later.
@ -47,7 +47,7 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
  PackedReduceResult result;
  auto labels = info.labels.View(ctx->Device());
  if (ctx->IsCUDA()) {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
    dh::XGBCachingDeviceAllocator<char> alloc;
    thrust::counting_iterator<size_t> begin(0);
    thrust::counting_iterator<size_t> end = begin + labels.Size();
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Hendrik Groove	9122d959e8	revert	2024-10-22 01:21:22 +02:00
Hendrik Groove	c78b0b752a	remove async	2024-10-22 01:18:12 +02:00
Hendrik Groove	54e93c7a0b	switch to default stream	2024-10-22 01:14:29 +02:00
Hendrik Groove	2bbb8b3786	revert	2024-10-22 00:58:01 +02:00
Hendrik Groove	55f995fc50	remove async	2024-10-22 00:53:56 +02:00
Hendrik Groove	cbaf5511ac	test	2024-10-22 00:51:34 +02:00
Hendrik Groove	59cc283242	use hip functions	2024-10-22 00:46:07 +02:00
Hendrik Groove	038d61a802	try other stream	2024-10-22 00:14:00 +02:00
Hendrik Groove	8fb2258a2f	reset	2024-10-22 00:10:40 +02:00
Hendrik Groove	02882a4b33	reset regression_obj.cu	2024-10-22 00:05:53 +02:00
Hendrik Groove	0d92f6ca9c	hipStreamDefault	2024-10-21 23:58:27 +02:00
Hendrik Groove	2a554ba4a7	reset device helper	2024-10-21 23:53:19 +02:00
Hendrik Groove	1c666db349	reset learner	2024-10-21 23:49:22 +02:00
Hendrik Groove	1931a70598	stream logic	2024-10-21 23:24:10 +02:00
Hendrik Groove	20a9c223b6	remove logging	2024-10-21 21:52:34 +02:00
Hendrik Groove	9659d0e7bd	WARP_SIZE	2024-10-21 21:35:25 +02:00
Hendrik Groove	768c8b298c	remove logging	2024-10-21 21:24:47 +02:00
Hendrik Groove	94ffd57641	try direct instantiation of EvaluateSplitsKernel	2024-10-21 21:22:57 +02:00
Hendrik Groove	ee17a5a26c	try compile option	2024-10-21 21:19:11 +02:00
Hendrik Groove	8c15f3b665	revert	2024-10-21 11:43:29 +02:00
Hendrik Groove	bb2feab0b2	try	2024-10-21 01:55:41 +02:00
Hendrik Groove	1b6c6baf76	restore stream logic	2024-10-21 01:39:43 +02:00
Hendrik Groove	b3ee7a59c7	try other stream	2024-10-21 01:33:00 +02:00
Hendrik Groove	ea3e7adcdc	hipStreamPerThread	2024-10-21 01:16:40 +02:00
Hendrik Groove	807ee5da88	fix	2024-10-21 00:39:06 +02:00
Hendrik Groove	a135895f3a	fix	2024-10-21 00:35:28 +02:00
Hendrik Groove	ed1636b9c0	fix	2024-10-21 00:32:33 +02:00
Hendrik Groove	1f4154d756	fix	2024-10-21 00:29:38 +02:00
Hendrik Groove	86fcbaf0e5	fix	2024-10-21 00:26:25 +02:00
Hendrik Groove	0d600b4535	try to change stream	2024-10-21 00:25:18 +02:00
Hendrik Groove	6fcffef7dc	fix	2024-10-21 00:18:25 +02:00
Hendrik Groove	ca6fcd361e	fix	2024-10-21 00:13:27 +02:00
Hendrik Groove	c39ad981ce	fix	2024-10-21 00:11:08 +02:00
Hendrik Groove	1de5734d4c	more logging	2024-10-21 00:08:50 +02:00
Hendrik Groove	e2e6b6e71f	more logging	2024-10-21 00:06:21 +02:00
Hendrik Groove	db66fad9e9	SumReduction logging	2024-10-20 23:27:50 +02:00
Hendrik Groove	bf2ef6c586	log reduce function	2024-10-20 23:26:21 +02:00
Hendrik Groove	58a27ba968	more logging	2024-10-20 20:59:23 +02:00
Hendrik Groove	c964dd62b4	more logging	2024-10-20 20:53:50 +02:00
Hendrik Groove	4a10135006	validate label debug	2024-10-20 18:11:03 +02:00
Hendrik Groove	f54355f470	fix path	2024-10-20 17:56:27 +02:00
Hendrik Groove	08f3936bc9	fix path	2024-10-20 17:51:05 +02:00
Hendrik Groove	f50d5344f3	get gradient error logging	2024-10-20 17:40:52 +02:00
Hendrik Groove	ab41cd26a6	add gpu error check	2024-10-20 17:34:51 +02:00
Hendrik Groove	fd95be5f20	validate label logging	2024-10-20 17:32:22 +02:00
Hendrik Groove	60a3bea7c6	add logging	2024-10-20 17:30:17 +02:00
Hendrik Groove	7301022fed	logging	2024-10-20 17:05:34 +02:00
Hendrik Groove	288193cf82	try	2024-10-20 02:41:57 +02:00
Hendrik Groove	e142b52540	use new func	2024-10-20 02:18:41 +02:00
Hendrik Groove	e8fceb8198	add logging	2024-10-20 02:03:55 +02:00
Hendrik Groove	971d3ca8cd	array interface	2024-10-20 01:46:48 +02:00
Hendrik Groove	206f305b65	array interface	2024-10-20 01:28:40 +02:00
Hendrik Groove	8e703f3a5a	try hipHostMalloc	2024-10-20 01:13:44 +02:00
Hendrik Groove	0790bf7f8f	change back	2024-10-17 17:47:10 +02:00
Hendrik Groove	d8a92fe783	test	2024-10-17 17:42:37 +02:00
Hui Liu	bce48bffc6	Merge pull request #2 from hliuca/master-rocm Merge latest upstream changes	2024-04-23 09:50:11 -07:00
Hui Liu	45dc134151	merge changes from upstream	2024-04-22 14:22:16 -07:00
Hui Liu	b27f35e270	rm hip from src	2024-04-22 12:31:14 -07:00
Hui Liu	8b75204fed	merge latest change from upstream	2024-04-22 09:35:31 -07:00
Hui Liu	42edd78f30	update rocgputreeshap	2024-04-09 12:47:57 -07:00
Hui Liu	ff549ae933	sync upstream code	2024-03-20 16:14:38 -07:00
Hui Liu	3ad7461ddc	add ROCm installation	2024-03-12 09:53:10 -07:00
Hui Liu	fe36d96247	add ROCm installation	2024-03-12 09:52:53 -07:00
Hui Liu	968dbf25fb	merge latest changes	2024-03-12 09:13:09 -07:00
Hui Liu	2f6027525d	Merge branch 'sync-2024Jan24' into master-rocm	2024-02-01 14:49:27 -08:00
Hui Liu	44db1cef54	Merge branch 'master' into sync-2024Jan24	2024-02-01 14:41:48 -08:00
Hui Liu	cf8c7e63af	Merge branch 'sync-2024Jan24' into master-rocm	2024-01-26 15:47:12 -08:00
Hui Liu	2cb579ff3c	fix memory type	2024-01-26 15:46:42 -08:00
Hui Liu	e3e3e34cd2	merge latest changes	2024-01-24 13:41:05 -08:00
Hui Liu	3fe874078c	merge latest changes	2024-01-24 13:30:08 -08:00
Hui Liu	74677e4e9d	use __HIPCC__ for device code	2024-01-24 11:57:58 -08:00
Hui Liu	069cf1d019	use __HIPCC__ for device code	2024-01-24 11:30:01 -08:00
Hui Liu	1e0ccf7b87	fix random	2024-01-21 12:48:41 -08:00
Hui Liu	9759e28e6a	compiler errors fix	2024-01-12 12:09:01 -08:00
Hui Liu	1e1e8be3a5	merge latest, Jan 12 2024	2024-01-12 09:57:11 -08:00
Hui Liu	c42c7d99f1	fix memoryType	2024-01-11 14:10:30 -08:00
Hui Liu	fd3ad29dc4	workaround memoryType and change rccl config	2024-01-11 14:03:05 -08:00
Hui Liu	2d7ffbdf3d	merge latest changes	2023-12-13 21:06:28 -08:00
Hui Liu	c81731308c	fix RCCL	2023-11-02 16:39:24 -07:00
Hui Liu	51efb7442e	support HIP for half in coll	2023-11-02 10:53:12 -07:00
Hui Liu	3af5dfd546	Merge branch 'master'	2023-11-02 09:05:31 -07:00
Hui Liu	129bb76941	enable federated	2023-10-31 16:31:56 -07:00
Hui Liu	123af45327	Merge branch 'master'	2023-10-31 15:59:31 -07:00
Hui Liu	6ac806fefd	Merge branch 'master'	2023-10-31 09:05:56 -07:00
Hui Liu	8fab17ae8f	rm hip.h files	2023-10-30 21:20:28 -07:00
Hui Liu	9b7aa1a7cd	unify cuda to hip	2023-10-30 17:12:06 -07:00
Hui Liu	4eb371b3f0	unify cuda to hip	2023-10-30 17:10:06 -07:00
Hui Liu	6df27eadc9	rm hip_category from source	2023-10-30 16:34:49 -07:00
Hui Liu	02f5464fa6	enable coll and comm	2023-10-30 15:15:05 -07:00
Hui Liu	b6b5218245	enable RCCL	2023-10-30 14:05:04 -07:00
Hui Liu	d7f1235b7d	Merge branch 'master' into sync-condition-2023Oct11	2023-10-30 13:19:33 -07:00
Hui Liu	1bedd76e94	rm un-necessary code	2023-10-30 13:14:45 -07:00
Hui Liu	40dc263602	enable ROCm for jvm and R	2023-10-30 12:52:44 -07:00
Hui Liu	32ae49ab92	temp hack for multi GPUs	2023-10-27 13:00:49 -07:00
Hui Liu	6bbca9a8b7	restore learner	2023-10-27 11:15:06 -07:00
Hui Liu	6762230d9a	namespace to reduce code	2023-10-27 10:51:32 -07:00
Hui Liu	4302200a33	Merge branch 'master' into sync-condition-2023Oct11	2023-10-27 10:09:37 -07:00
Hui Liu	4a4b528d54	add namespace aliases to reduce code	2023-10-27 09:11:55 -07:00
Hui Liu	e00131c465	Merge branch 'master' into sync-condition-2023Oct11	2023-10-26 11:35:48 -07:00
Hui Liu	cd28b9f997	add back per-thread	2023-10-24 15:17:19 -07:00
Hui Liu	3752b06550	Merge branch 'master' into sync-condition-2023Oct11	2023-10-24 10:46:38 -07:00
Hui Liu	79319dfd4d	format	2023-10-23 22:29:48 -07:00
Hui Liu	558352afc9	fix stream	2023-10-23 21:51:20 -07:00
Hui Liu	24be98f61f	Merge branch 'master' into sync-condition-2023Oct11	2023-10-23 21:29:54 -07:00
Hui Liu	65012b356c	rm some hip	2023-10-23 17:13:02 -07:00
Hui Liu	f9f39b092b	add HIP LIB PATH	2023-10-23 16:52:33 -07:00
Hui Liu	643b334919	add nccl_device_communicator.hip	2023-10-23 16:43:03 -07:00
Hui Liu	6ba66463b6	fix uuid and Clear/SetValid	2023-10-23 16:32:26 -07:00
Hui Liu	55994b1ac7	enable ROCm on latest XGBoost	2023-10-23 11:15:04 -07:00
Hui Liu	15421e40d9	enable ROCm on latest XGBoost	2023-10-23 11:07:08 -07:00
Your Name	fb19e15ce3	rm setup.py	2023-10-19 11:59:19 -07:00
Your Name	ffbbc9c968	add cuda to hip wrapper	2023-10-17 12:42:37 -07:00
Your Name	ea19555474	temp merge, disable 1 line, SetValid	2023-10-12 16:16:44 -07:00
Hui Liu	85d3017ca5	Merge pull request #1 from ROCmSoftwarePlatform/create-pull-request/update-rapids [CI] Update RAPIDS to latest stable	2023-09-05 13:10:11 -07:00
amdsc21	5929890174	[CI] Update RAPIDS to latest stable	2023-08-10 20:02:16 +00:00
amdsc21	2e7e9d3b2d	update rocgputreeshap branch	2023-06-23 19:50:08 +02:00
amdsc21	3e0c7d1dee	new url for rocgputreeshap	2023-06-23 19:46:45 +02:00
amdsc21	2f47a1ebe6	rm warp-primitives	2023-06-22 21:43:00 +02:00
amdsc21	5ca7daaa13	merge latest changes	2023-06-15 21:39:14 +02:00
amdsc21	5f78360949	merge changes Jun092023	2023-06-09 22:41:33 +02:00
amdsc21	35cde3b1b2	remove some hip.h	2023-06-07 04:48:09 +02:00
amdsc21	ce345c30a8	remove some hip.h	2023-06-07 03:39:01 +02:00
amdsc21	af8845405a	sync Jun 5	2023-06-07 02:43:21 +02:00
amdsc21	9ee1852d4e	restore device helper	2023-06-02 02:55:13 +02:00
Your Name	6ecd7903f2	Merge branch 'master' into sync-condition-2023Jun01	2023-06-01 15:58:31 -07:00
Your Name	42867a4805	sync Jun 1	2023-06-01 15:55:06 -07:00
amdsc21	c5b575e00e	fix host __assert_fail	2023-05-24 19:40:24 +02:00
amdsc21	1354138b7d	Merge branch 'master' into sync-condition-2023May15	2023-05-24 17:44:16 +02:00
amdsc21	b994a38b28	Merge branch 'master' into sync-condition-2023May15	2023-05-23 01:07:50 +02:00
amdsc21	3a834c4992	change workflow	2023-05-20 07:04:06 +02:00
amdsc21	b22644fc10	add hip.h	2023-05-20 01:25:33 +02:00
amdsc21	7663d47383	Merge branch 'master' into sync-condition-2023May15	2023-05-19 20:30:35 +02:00
amdsc21	88fc8badfa	Merge branch 'master' into sync-condition-2023May15	2023-05-17 19:55:50 +02:00
amdsc21	8cad8c693c	sync up May15 2023	2023-05-15 18:59:18 +02:00
amdsc21	b066accad6	fix lambdarank_obj	2023-05-02 21:06:22 +02:00
amdsc21	b324d51f14	fix array_interface.h half type	2023-05-02 20:50:50 +02:00
amdsc21	65097212b3	fix IterativeDeviceDMatrix, support HIP	2023-05-02 20:20:11 +02:00
amdsc21	4a24ca2f95	fix helpers.h, enable HIP	2023-05-02 20:04:23 +02:00
amdsc21	83e6fceb5c	fix lambdarank_obj.cc, support HIP	2023-05-02 19:03:18 +02:00
amdsc21	e4538cb13c	fix, to support hip	2023-05-02 17:43:11 +02:00
amdsc21	5446c501af	merge 23Mar01	2023-05-02 00:05:58 +02:00
amdsc21	313a74b582	add Shap Magic to check if use cat	2023-05-01 21:55:14 +02:00
amdsc21	65d83e288f	fix device query	2023-04-19 19:53:26 +02:00
amdsc21	f645cf51c1	Merge branch 'master' into sync-condition-2023Apr11	2023-04-17 18:33:00 +02:00
amdsc21	db8420225b	fix RCCL	2023-04-12 01:09:14 +02:00
amdsc21	843fdde61b	sync Apr 11 2023	2023-04-11 20:03:25 +02:00
amdsc21	08bc4b0c0f	Merge branch 'master' into sync-condition-2023Apr11	2023-04-11 19:38:38 +02:00
amdsc21	6825d986fd	move Dockerfile to ci	2023-04-11 19:34:23 +02:00
paklui	d155ec77f9	building docker for xgboost-amd-condition	2023-03-30 13:36:39 -07:00
amdsc21	991738690f	Merge branch 'sync-condition-2023Mar27' into amd-condition	2023-03-30 05:16:36 +02:00
amdsc21	aeb3fd1c95	Merge branch 'master' into sync-condition-2023Mar27	2023-03-30 05:15:55 +02:00
amdsc21	141a062e00	Merge branch 'sync-condition-2023Mar27' into amd-condition	2023-03-30 00:47:16 +02:00
amdsc21	acad01afc9	sync Mar 29	2023-03-30 00:46:50 +02:00
amdsc21	f289e5001d	Merge branch 'sync-condition-2023Mar27' into amd-condition	2023-03-28 00:24:12 +02:00
amdsc21	06d9b998ce	fix CAPI BuildInfo	2023-03-28 00:14:18 +02:00
amdsc21	c50cc424bc	sync Mar 27 2023	2023-03-27 18:54:41 +02:00
amdsc21	8c77e936d1	tune grid size	2023-03-26 17:45:19 +02:00
amdsc21	18034a4291	tune histogram	2023-03-26 01:42:51 +01:00
amdsc21	7ee4734d3a	rm device_helpers.hip.h from cu	2023-03-26 00:24:11 +01:00
amdsc21	ee582f03c3	rm device_helpers.hip.h from cuh	2023-03-25 23:35:57 +01:00
amdsc21	f3286bac04	rm warp header	2023-03-25 23:01:44 +01:00
amdsc21	3ee3bea683	fix warp header	2023-03-25 22:37:37 +01:00
amdsc21	5098735698	Merge branch 'condition-sync-Mar24-23' into hui-condition	2023-03-25 05:28:40 +01:00
amdsc21	e74b3bbf3c	fix macro	2023-03-25 05:17:39 +01:00
amdsc21	22525c002a	fix macro	2023-03-25 05:08:30 +01:00
amdsc21	80961039d7	fix macro	2023-03-25 05:00:55 +01:00
amdsc21	1474789787	add new file	2023-03-25 04:54:02 +01:00
amdsc21	1dc138404a	initial merge, fix linalg.h	2023-03-25 04:48:47 +01:00
amdsc21	e1d050f64e	initial merge, fix linalg.h	2023-03-25 04:37:43 +01:00
amdsc21	7fbc561e17	initial merge	2023-03-25 04:31:55 +01:00
amdsc21	d97be6f396	enable last 3 tests	2023-03-25 04:05:05 +01:00
amdsc21	f1211cffca	enable last 3 tests	2023-03-25 00:45:52 +01:00
amdsc21	e0716afabf	fix objective/objective.cc, CMakeFile and setup.py	2023-03-23 20:22:34 +01:00
amdsc21	595cd81251	add max shared mem workaround	2023-03-19 20:08:42 +01:00
amdsc21	0325ce0bed	update gputreeshap	2023-03-19 20:07:36 +01:00
amdsc21	a79a35c22c	add warp size	2023-03-15 22:00:26 +01:00
amdsc21	4484c7f073	disable Optin Shared Mem	2023-03-15 02:10:16 +01:00
amdsc21	8207015e48	fix ../tests/cpp/common/test_span.h	2023-03-14 22:19:06 +01:00
amdsc21	364df7db0f	fix ../tree/gpu_hist/evaluate_splits.hip bugs, size 64	2023-03-14 06:17:21 +01:00
amdsc21	a2bab03205	fix aft_obj.hip	2023-03-13 23:19:59 +01:00
amdsc21	b71c1b50de	fix macro, no !	2023-03-12 23:02:28 +01:00
amdsc21	fa2336fcfd	sort bug fix	2023-03-12 07:09:10 +01:00
amdsc21	7d96758382	macro format	2023-03-11 06:57:24 +01:00
amdsc21	b0dacc5a80	fix bug	2023-03-11 03:47:23 +01:00
amdsc21	f64152bf97	add helpers.hip	2023-03-11 02:56:50 +01:00
amdsc21	b4dbe7a649	fix isnan	2023-03-11 02:39:58 +01:00
amdsc21	e5b6219a84	typo	2023-03-11 02:30:27 +01:00
amdsc21	3a07b1edf8	complete test porting	2023-03-11 02:17:05 +01:00
amdsc21	9bf16a2ca6	testing porting	2023-03-11 01:38:54 +01:00
amdsc21	332f6a89a9	more tests	2023-03-11 01:33:48 +01:00
amdsc21	204d0c9a53	add hip tests	2023-03-11 00:38:16 +01:00
amdsc21	e961016e71	rm HIPCUB	2023-03-10 22:21:37 +01:00
amdsc21	f0b8c02f15	merge latest changes	2023-03-10 22:10:20 +01:00
amdsc21	5e8b1842b9	fix Pointer Attr	2023-03-10 19:06:02 +01:00
amdsc21	9f072b50ba	fix __popc	2023-03-10 17:14:31 +01:00
amdsc21	e1ddb5ae58	fix macro XGBOOST_USE_HIP	2023-03-10 07:11:05 +01:00
amdsc21	643e2a7b39	fix macro XGBOOST_USE_HIP	2023-03-10 07:09:41 +01:00
amdsc21	bde3107c3e	fix macro XGBOOST_USE_HIP	2023-03-10 07:01:25 +01:00
amdsc21	5edfc1e2e9	finish ellpack_page.cc	2023-03-10 06:41:25 +01:00
amdsc21	c073417d0c	finish aft_obj.cu	2023-03-10 06:39:03 +01:00
amdsc21	9bbbeb3f03	finish multiclass_obj.cu	2023-03-10 06:35:46 +01:00
amdsc21	4bde2e3412	finish multiclass_obj.cu	2023-03-10 06:35:21 +01:00
amdsc21	58a9fe07b6	finish multiclass_obj.cu	2023-03-10 06:35:06 +01:00
amdsc21	41407850d5	finish rank_obj.cu	2023-03-10 06:29:08 +01:00
amdsc21	968a1db4c0	finish regression_obj.cu	2023-03-10 06:07:53 +01:00
amdsc21	ad710e4888	finish hinge.cu	2023-03-10 06:04:59 +01:00
amdsc21	4e3c699814	finish adaptive.cu	2023-03-10 06:02:48 +01:00
amdsc21	757de84398	finish quantile.cu	2023-03-10 05:55:51 +01:00
amdsc21	d27f9dfdce	finish host_device_vector.cu	2023-03-10 05:45:38 +01:00
amdsc21	14cc438a64	finish stats.cu	2023-03-10 05:38:16 +01:00
amdsc21	911a5d8a60	finish hist_util.cu	2023-03-10 05:32:38 +01:00
amdsc21	54b076b40f	finish common.cu	2023-03-10 05:20:29 +01:00
amdsc21	91a5ef762e	finish common.cu	2023-03-10 05:19:41 +01:00
amdsc21	8fd2af1c8b	finish numeric.cu	2023-03-10 05:16:23 +01:00
amdsc21	bb6adda8a3	finish c_api.cu	2023-03-10 05:12:51 +01:00
amdsc21	a76ccff390	finish c_api.cu	2023-03-10 05:11:20 +01:00
amdsc21	61c0b19331	finish ellpack_page_source.cu	2023-03-10 05:06:36 +01:00
amdsc21	fa9f69dd85	finish sparse_page_dmatrix.cu	2023-03-10 05:04:57 +01:00
amdsc21	080fc35c4b	finish ellpack_page_raw_format.cu	2023-03-10 05:02:35 +01:00
amdsc21	ccce4cf7e1	finish data.cu	2023-03-10 05:00:57 +01:00
amdsc21	713ab9e1a0	finish sparse_page_source.cu	2023-03-10 04:42:56 +01:00
amdsc21	134cbfddbe	finish gradient_index.cu	2023-03-10 04:40:33 +01:00
amdsc21	6e2c5be83e	finish array_interface.cu	2023-03-10 04:36:04 +01:00
amdsc21	185dbce21f	finish ellpack_page.cu	2023-03-10 04:26:09 +01:00
amdsc21	49732359ef	finish iterative_dmatrix.cu	2023-03-10 03:47:00 +01:00
amdsc21	ec9f500a49	finish proxy_dmatrix.cu	2023-03-10 03:40:07 +01:00
amdsc21	53244bef6f	finish simple_dmatrix.cu	2023-03-10 03:38:09 +01:00
amdsc21	f0febfbcac	finish gpu_predictor.cu	2023-03-10 01:29:54 +01:00
amdsc21	1c58ff61d1	finish fit_stump.cu	2023-03-10 00:46:29 +01:00
amdsc21	1530c03f7d	finish constraints.cu	2023-03-09 22:43:51 +01:00
amdsc21	309268de02	finish updater_gpu_hist.cu	2023-03-09 22:40:44 +01:00
amdsc21	500428cc0f	finish row_partitioner.cu	2023-03-09 22:31:11 +01:00
amdsc21	495816f694	finished gradient_based_sampler.cu	2023-03-09 22:26:08 +01:00
amdsc21	df42dd2c53	finished evaluator.cu	2023-03-09 22:22:05 +01:00
amdsc21	f55243fda0	finish evaluate_splits.cu	2023-03-09 22:15:10 +01:00
amdsc21	1e09c21456	finished feature_groups.cu	2023-03-09 21:31:00 +01:00
amdsc21	0ed5d3c849	finished histogram.cu	2023-03-09 21:28:37 +01:00
amdsc21	f67e7de7ef	finished communicator.cu	2023-03-09 21:02:48 +01:00
amdsc21	5044713388	finished updater_gpu_coordinate.cu	2023-03-09 20:53:54 +01:00
amdsc21	c875f0425f	finished rank_metric.cu	2023-03-09 20:48:31 +01:00
amdsc21	4fd08b6c32	finished survival_metric.cu	2023-03-09 20:41:52 +01:00
amdsc21	b9d86d44d6	finish multiclass_metric.cu	2023-03-09 20:37:16 +01:00
amdsc21	a56055225a	fix auc.cu	2023-03-09 20:29:38 +01:00
amdsc21	6eba0a56ec	fix CMakeLists.txt	2023-03-09 18:57:14 +01:00
amdsc21	00c24a58b1	finish elementwise_metric.cu	2023-03-08 22:50:07 +01:00
amdsc21	6fa248b75f	try elementwise_metric.cu	2023-03-08 22:42:48 +01:00
amdsc21	946f9e9802	fix gbtree.cc	2023-03-08 21:44:20 +01:00
amdsc21	4c4e5af29c	port elementwise_metric.cu	2023-03-08 21:39:56 +01:00
amdsc21	7e1b06417b	finish gbtree.cu porting	2023-03-08 21:09:56 +01:00
amdsc21	cdd7794641	add unused option	2023-03-08 20:37:53 +01:00
amdsc21	cd743a1ae9	fix DispatchRadixSort	2023-03-08 20:31:23 +01:00
amdsc21	a45005863b	fix DispatchScan	2023-03-08 20:15:33 +01:00
amdsc21	bdcb036592	add context.hip	2023-03-08 07:34:19 +01:00
amdsc21	7a3a9b682a	add device_helpers.hip.h	2023-03-08 07:18:33 +01:00
amdsc21	0a711662c3	add device_helpers.hip.h	2023-03-08 07:10:32 +01:00
amdsc21	312e58ec99	enable rocm, fix common.h	2023-03-08 06:45:03 +01:00
amdsc21	ca8f4e7993	enable rocm, fix stats.cuh	2023-03-08 06:43:06 +01:00
amdsc21	60795f22de	enable rocm, fix linalg_op.cuh	2023-03-08 06:42:20 +01:00
amdsc21	05fdca893f	enable rocm, fix cuda_pinned_allocator.h	2023-03-08 06:39:40 +01:00
amdsc21	d8cc93f3f2	enable rocm, fix algorithm.cuh	2023-03-08 06:38:35 +01:00
amdsc21	62c4efac51	enable rocm, fix transform.h	2023-03-08 06:37:34 +01:00
amdsc21	ba9e00d911	enable rocm, fix hist_util.cuh	2023-03-08 06:36:15 +01:00
amdsc21	d3be67ad8e	enable rocm, fix quantile.cuh	2023-03-08 06:32:09 +01:00
amdsc21	2eb0b6aae4	enable rocm, fix threading_utils.cuh	2023-03-08 06:30:52 +01:00
amdsc21	327f1494f1	enable rocm, fix cuda_context.cuh	2023-03-08 06:29:45 +01:00
amdsc21	fa92aa56ee	enable rocm, fix device_adapter.cuh	2023-03-08 06:26:31 +01:00
amdsc21	427f6c2a1a	enable rocm, fix simple_dmatrix.cuh	2023-03-08 06:24:34 +01:00
amdsc21	270c7b4802	enable rocm, fix row_partitioner.cuh	2023-03-08 06:22:25 +01:00
amdsc21	0fc1f640a9	enable rocm, fix nccl_device_communicator.cuh	2023-03-08 06:18:13 +01:00
amdsc21	762fd9028d	enable rocm, fix device_communicator_adapter.cuh	2023-03-08 06:13:29 +01:00
amdsc21	f2009533e1	rm hip.h	2023-03-08 06:04:01 +01:00
amdsc21	53b5cd73f2	add hip flags	2023-03-08 03:42:51 +01:00
amdsc21	52b05d934e	add hip	2023-03-08 03:32:19 +01:00
amdsc21	840f15209c	add HIP flags, common	2023-03-08 03:11:49 +01:00
amdsc21	1e1c7fd8d5	add HIP flags, c_api	2023-03-08 01:34:37 +01:00
amdsc21	f5f800c80d	add HIP flags	2023-03-08 01:33:38 +01:00
amdsc21	6b7be96373	add HIP flags	2023-03-08 01:22:25 +01:00
amdsc21	75712b9c3c	enable HIP flags	2023-03-08 01:10:07 +01:00
amdsc21	ed45aa2816	Merge branch 'master' into dev-hui	2023-03-08 00:39:33 +01:00
amdsc21	f286ae5bfa	add hip rocthrust hipcub	2023-03-07 06:35:00 +01:00
amdsc21	f13a7f8d91	add submodules	2023-03-07 05:44:24 +01:00
amdsc21	c51a1c9aae	rename hip.cc to hip	2023-03-07 05:39:53 +01:00
amdsc21	30de728631	fix hip.cc	2023-03-07 05:11:42 +01:00
amdsc21	75fa15b36d	add hip support	2023-03-07 04:02:49 +01:00
amdsc21	eb30cb6293	add hip support	2023-03-07 03:49:52 +01:00
amdsc21	cafbfce51f	add hip.h	2023-03-07 03:46:26 +01:00
amdsc21	6039a71e6c	add hip structure	2023-03-07 02:17:19 +01:00
				`@ -0,0 +1 @@`
				`Subproject commit 187e4be94513c71bea1e10a3eded6b9b2da0521f`