fix memory type

merge v2.0.3 from upstream
use __HIPCC__ for device code
2024-01-26 15:45:04 -08:00 · 2024-01-25 07:40:06 -08:00 · 2024-01-24 12:32:51 -08:00 · 2024-01-11 13:13:14 -08:00 · 2023-12-18 23:49:41 -08:00 · 2023-12-14 17:54:05 -08:00
317 changed files with 4079 additions and 908 deletions
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@ -51,14 +51,14 @@ jobs:
      id: extract_branch
      if: |
        (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
-        matrix.os == 'windows-latest'
+        (matrix.os == 'windows-latest' || matrix.os == 'macos-11')

    - name: Publish artifact xgboost4j.dll to S3
      run: |
        cd lib/
        Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
        dir
-        python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
+        python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
      if: |
        (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
        matrix.os == 'windows-latest'
@ -66,6 +66,19 @@ jobs:
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}

+    - name: Publish artifact libxgboost4j.dylib to S3
+      run: |
+        cd lib/
+        mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
+        ls
+        python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
+      if: |
+        (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
+        matrix.os == 'macos-11'
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
+

    - name: Test XGBoost4J (Core, Spark, Examples)
      run: |
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@ -25,7 +25,7 @@ jobs:
      with:
        submodules: 'true'

-    - uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1
+    - uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
      with:
        r-version: ${{ matrix.config.r }}

@ -64,7 +64,7 @@ jobs:
      with:
        submodules: 'true'

-    - uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1
+    - uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
      with:
        r-version: ${{ matrix.config.r }}

--- a/.gitmodules
+++ b/.gitmodules
@ -5,3 +5,6 @@
 [submodule "gputreeshap"]
 	path = gputreeshap
 	url = https://github.com/rapidsai/gputreeshap.git
+[submodule "rocgputreeshap"]
+	path = rocgputreeshap
+	url = https://github.com/ROCmSoftwarePlatform/rocgputreeshap
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -32,4 +32,3 @@ formats:
 python:
  install:
   - requirements: doc/requirements.txt
-  system_packages: true
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 2.0.0)
+project(xgboost LANGUAGES CXX C VERSION 2.0.3)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
@ -58,7 +58,7 @@ option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC
 option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
 option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
-option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF)
+option(USE_DEVICE_DEBUG "Generate CUDA/HIP device debug info." OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
 option(RABIT_MOCK "Build rabit with mock" OFF)
@ -71,6 +71,10 @@ option(USE_NCCL  "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING
  "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
+## HIP
+option(USE_HIP  "Build with GPU acceleration" OFF)
+option(USE_RCCL "Build with RCCL to enable distributed GPU support." OFF)
+option(BUILD_WITH_SHARED_RCCL "Build with shared RCCL library." OFF)
 ## Copied From dmlc
 option(USE_HDFS "Build with HDFS support" OFF)
 option(USE_AZURE "Build with AZURE support" OFF)
@ -93,6 +97,7 @@ option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
 if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
  message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
 endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
+
 if (USE_NCCL AND NOT (USE_CUDA))
  message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
 endif (USE_NCCL AND NOT (USE_CUDA))
@ -102,6 +107,17 @@ endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
 if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
  message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
 endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
+
+if (USE_RCCL AND NOT (USE_HIP))
+  message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
+endif (USE_RCCL AND NOT (USE_HIP))
+if (USE_DEVICE_DEBUG AND NOT (USE_HIP))
+  message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_HIP` flag.")
+endif (USE_DEVICE_DEBUG AND NOT (USE_HIP))
+if (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
+  message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
+endif (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
+
 if (JVM_BINDINGS AND R_LIB)
  message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
 endif (JVM_BINDINGS AND R_LIB)
@ -115,9 +131,15 @@ endif (USE_AVX)
 if (PLUGIN_LZ4)
  message(SEND_ERROR  "The option 'PLUGIN_LZ4' is removed from XGBoost.")
 endif (PLUGIN_LZ4)
+
 if (PLUGIN_RMM AND NOT (USE_CUDA))
  message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
 endif (PLUGIN_RMM AND NOT (USE_CUDA))
+
+if (PLUGIN_RMM AND NOT (USE_HIP))
+  message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_HIP` flag.")
+endif (PLUGIN_RMM AND NOT (USE_HIP))
+
 if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
  message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
 endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
@ -170,6 +192,24 @@ if (USE_CUDA)
  find_package(CUDAToolkit REQUIRED)
 endif (USE_CUDA)

+if (USE_HIP)
+  set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE)
+  # `export CXX=' is ignored by CMake HIP.
+  set(CMAKE_HIP_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+  message(STATUS "Configured HIP host compiler: ${CMAKE_HIP_HOST_COMPILER}")
+
+  enable_language(HIP)
+  find_package(hip REQUIRED)
+  find_package(rocthrust REQUIRED)
+  find_package(hipcub REQUIRED)
+
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
+  add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
+endif (USE_HIP)
+
 if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
    ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
      (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
@ -209,6 +249,10 @@ if (USE_NCCL)
  find_package(Nccl REQUIRED)
 endif (USE_NCCL)

+if (USE_RCCL)
+  find_package(rccl REQUIRED)
+endif (USE_RCCL)
+
 # dmlc-core
 msvc_use_static_runtime()
 if (FORCE_SHARED_CRT)
@ -233,6 +277,11 @@ endif (RABIT_BUILD_MPI)
 add_subdirectory(${xgboost_SOURCE_DIR}/src)
 target_link_libraries(objxgboost PUBLIC dmlc)

+# Link -lstdc++fs for GCC 8.x
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
+  target_link_libraries(objxgboost PUBLIC stdc++fs)
+endif()
+
 # Exports some R specific definitions and objects
 if (R_LIB)
  add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 2.0.0.1
-Date: 2022-10-18
+Version: 2.0.3.1
+Date: 2023-12-14
 Authors@R: c(
  person("Tianqi", "Chen", role = c("aut"),
         email = "tianqi.tchen@gmail.com"),
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
        i == env$begin_iteration ||
        i == env$end_iteration) {
      stdev <- if (showsd) env$bst_evaluation_err else NULL
-      msg <- format.eval.string(i, env$bst_evaluation, stdev)
+      msg <- .format_eval_string(i, env$bst_evaluation, stdev)
      cat(msg, '\n')
    }
  }
@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
    if ((maximize && score > best_score) ||
        (!maximize && score < best_score)) {

-      best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+      best_msg <<- .format_eval_string(
+        i, env$bst_evaluation, env$bst_evaluation_err
+      )
      best_score <<- score
      best_iteration <<- i
      best_ntreelimit <<- best_iteration * env$num_parallel_tree
@ -754,7 +756,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
 #

 # Format the evaluation metric string
-format.eval.string <- function(iter, eval_res, eval_err = NULL) {
+.format_eval_string <- function(iter, eval_res, eval_err = NULL) {
  if (length(eval_res) == 0)
    stop('no evaluation results')
  enames <- names(eval_res)
--- a/R-package/configure
+++ b/R-package/configure
@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for xgboost 2.0.0.
+# Generated by GNU Autoconf 2.71 for xgboost 2.0.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@ -607,8 +607,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='xgboost'
 PACKAGE_TARNAME='xgboost'
-PACKAGE_VERSION='2.0.0'
-PACKAGE_STRING='xgboost 2.0.0'
+PACKAGE_VERSION='2.0.3'
+PACKAGE_STRING='xgboost 2.0.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems.
+\`configure' configures xgboost 2.0.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@ -1287,7 +1287,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of xgboost 2.0.0:";;
+     short | recursive ) echo "Configuration of xgboost 2.0.3:";;
   esac
  cat <<\_ACEOF

@ -1367,7 +1367,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-xgboost configure 2.0.0
+xgboost configure 2.0.3
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by xgboost $as_me 2.0.0, which was
+It was created by xgboost $as_me 2.0.3, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by xgboost $as_me 2.0.0, which was
+This file was extended by xgboost $as_me 2.0.3, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-xgboost config.status 2.0.0
+xgboost config.status 2.0.3
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@ -2,7 +2,7 @@

 AC_PREREQ(2.69)

-AC_INIT([xgboost],[2.0.0],[],[xgboost],[])
+AC_INIT([xgboost],[2.0.3],[],[xgboost],[])

 : ${R_HOME=`R RHOME`}
 if test -z "${R_HOME}"; then
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@ -120,11 +120,25 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
  ctx.nthread = asInteger(n_threads);
  std::int32_t threads = ctx.Threads();

+  if (is_int) {
    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
      for (size_t j = 0; j < ncol; ++j) {
-      data[i * ncol + j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
+        auto v = iin[i + nrow * j];
+        if (v == NA_INTEGER) {
+          data[i * ncol + j] = std::numeric_limits<float>::quiet_NaN();
+        } else {
+          data[i * ncol + j] = static_cast<float>(v);
+        }
      }
    });
+  } else {
+    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
+      for (size_t j = 0; j < ncol; ++j) {
+        data[i * ncol + j] = din[i + nrow * j];
+      }
+    });
+  }
+
  DMatrixHandle handle;
  CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol,
                                        asReal(missing), &handle, threads));
--- a/R-package/src/xgboost_custom.cc
+++ b/R-package/src/xgboost_custom.cc
@ -32,7 +32,7 @@ namespace common {
 bool CheckNAN(double v) {
  return ISNAN(v);
 }
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 double LogGamma(double v) {
  return lgammafn(v);
 }
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@ -56,6 +56,42 @@ test_that("xgb.DMatrix: basic construction", {
  expect_equal(raw_fd, raw_dgc)
 })

+test_that("xgb.DMatrix: NA", {
+  n_samples <- 3
+  x <- cbind(
+    x1 = sample(x = 4, size = n_samples, replace = TRUE),
+    x2 = sample(x = 4, size = n_samples, replace = TRUE)
+  )
+  x[1, "x1"] <- NA
+
+  m <- xgb.DMatrix(x)
+  xgb.DMatrix.save(m, "int.dmatrix")
+
+  x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
+  colnames(x) <- c("x1", "x2")
+  m <- xgb.DMatrix(x)
+
+  xgb.DMatrix.save(m, "float.dmatrix")
+
+  iconn <- file("int.dmatrix", "rb")
+  fconn <- file("float.dmatrix", "rb")
+
+  expect_equal(file.size("int.dmatrix"), file.size("float.dmatrix"))
+
+  bytes <- file.size("int.dmatrix")
+  idmatrix <- readBin(iconn, "raw", n = bytes)
+  fdmatrix <- readBin(fconn, "raw", n = bytes)
+
+  expect_equal(length(idmatrix), length(fdmatrix))
+  expect_equal(idmatrix, fdmatrix)
+
+  close(iconn)
+  close(fconn)
+
+  file.remove("int.dmatrix")
+  file.remove("float.dmatrix")
+})
+
 test_that("xgb.DMatrix: saving, loading", {
  # save to a local file
  dtest1 <- xgb.DMatrix(test_data, label = test_label)
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@ -181,6 +181,23 @@ function(xgboost_set_cuda_flags target)
    CUDA_RUNTIME_LIBRARY Static)
 endfunction(xgboost_set_cuda_flags)

+# Set HIP related flags to target.
+function(xgboost_set_hip_flags target)
+  if (USE_DEVICE_DEBUG)
+    target_compile_options(${target} PRIVATE
+      $<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:HIP>>:-G>)
+  endif (USE_DEVICE_DEBUG)
+
+  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1)
+  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
+  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)
+
+  set_target_properties(${target} PROPERTIES
+    HIP_STANDARD 17
+    HIP_STANDARD_REQUIRED ON
+    HIP_SEPARABLE_COMPILATION OFF)
+endfunction(xgboost_set_hip_flags)
+
 macro(xgboost_link_nccl target)
  if (BUILD_STATIC_LIB)
    target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
@ -193,6 +210,20 @@ macro(xgboost_link_nccl target)
  endif (BUILD_STATIC_LIB)
 endmacro(xgboost_link_nccl)

+macro(xgboost_link_rccl target)
+  if(BUILD_STATIC_LIB)
+    target_include_directories(${target} PUBLIC ${RCCL_INCLUDE_DIR}/rccl)
+    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
+    target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
+    target_link_libraries(${target} PUBLIC ${RCCL_LIBRARY})
+  else()
+    target_include_directories(${target} PRIVATE ${RCCL_INCLUDE_DIR}/rccl)
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
+    target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
+    target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
+  endif()
+endmacro()
+
 # compile options
 macro(xgboost_target_properties target)
  set_target_properties(${target} PROPERTIES
@ -215,6 +246,10 @@ macro(xgboost_target_properties target)
      -Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
      -Wall -Wextra -Wno-expansion-to-defined>
    )
+    target_compile_options(${target} PUBLIC
+       $<IF:$<COMPILE_LANGUAGE:HIP>,
+      -Wall -Wextra >
+    )
  endif(ENABLE_ALL_WARNINGS)

  target_compile_options(${target}
@ -283,6 +318,10 @@ macro(xgboost_target_link_libraries target)
    target_link_libraries(${target} PUBLIC CUDA::cudart_static)
  endif (USE_CUDA)

+  if (USE_HIP)
+    xgboost_set_hip_flags(${target})
+  endif (USE_HIP)
+
  if (PLUGIN_RMM)
    target_link_libraries(${target} PRIVATE rmm::rmm)
  endif (PLUGIN_RMM)
@ -291,6 +330,10 @@ macro(xgboost_target_link_libraries target)
    xgboost_link_nccl(${target})
  endif (USE_NCCL)

+  if(USE_RCCL)
+    xgboost_link_rccl(${target})
+  endif()
+
  if (USE_NVTX)
    target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
  endif (USE_NVTX)
--- a/cmake/xgboost-config.cmake.in
+++ b/cmake/xgboost-config.cmake.in
@ -3,6 +3,8 @@
 set(USE_OPENMP @USE_OPENMP@)
 set(USE_CUDA @USE_CUDA@)
 set(USE_NCCL @USE_NCCL@)
+set(USE_HIP @USE_HIP@)
+set(USE_RCCL @USE_RCCL@)
 set(XGBOOST_BUILD_STATIC_LIB @BUILD_STATIC_LIB@)

 include(CMakeFindDependencyMacro)
@ -15,6 +17,9 @@ if (XGBOOST_BUILD_STATIC_LIB)
  if(USE_CUDA)
    find_dependency(CUDA)
  endif()
+  if(USE_HIP)
+      find_dependency(HIP)
+  endif()
  # nccl should be linked statically if xgboost is built as static library.
 endif (XGBOOST_BUILD_STATIC_LIB)

--- a/demo/CLI/regression/runexp.sh
+++ b/demo/CLI/regression/runexp.sh
@ -4,13 +4,13 @@ python mapfeat.py
 # split train and test
 python mknfold.py machine.txt 1
 # training and output the models
-../../xgboost machine.conf
+../../../xgboost machine.conf
 # output predictions of test data
-../../xgboost machine.conf task=pred model_in=0002.model
+../../../xgboost machine.conf task=pred model_in=0002.model
 # print the boosters of 0002.model in dump.raw.txt
-../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
+../../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
 # print the boosters of 0002.model in dump.nice.txt with feature map
-../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt 
+../../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt 

 # cat the result
 cat dump.nice.txt
--- a/dev/change_scala_version.py
+++ b/dev/change_scala_version.py
@ -0,0 +1,79 @@
+import argparse
+import pathlib
+import re
+import shutil
+
+
+def main(args):
+    if args.scala_version == "2.12":
+        scala_ver = "2.12"
+        scala_patchver = "2.12.18"
+    elif args.scala_version == "2.13":
+        scala_ver = "2.13"
+        scala_patchver = "2.13.11"
+    else:
+        raise ValueError(f"Unsupported Scala version: {args.scala_version}")
+
+    # Clean artifacts
+    if args.purge_artifacts:
+        for target in pathlib.Path("jvm-packages/").glob("**/target"):
+            if target.is_dir():
+                print(f"Removing {target}...")
+                shutil.rmtree(target)
+
+    # Update pom.xml
+    for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"):
+        print(f"Updating {pom}...")
+        with open(pom, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+        with open(pom, "w", encoding="utf-8") as f:
+            replaced_scalaver = False
+            replaced_scala_binver = False
+            for line in lines:
+                for artifact in [
+                    "xgboost-jvm",
+                    "xgboost4j",
+                    "xgboost4j-gpu",
+                    "xgboost4j-spark",
+                    "xgboost4j-spark-gpu",
+                    "xgboost4j-flink",
+                    "xgboost4j-example",
+                ]:
+                    line = re.sub(
+                        f"<artifactId>{artifact}_[0-9\\.]*",
+                        f"<artifactId>{artifact}_{scala_ver}",
+                        line,
+                    )
+                # Only replace the first occurrence of scala.version
+                if not replaced_scalaver:
+                    line, nsubs = re.subn(
+                        r"<scala.version>[0-9\.]*",
+                        f"<scala.version>{scala_patchver}",
+                        line,
+                    )
+                    if nsubs > 0:
+                        replaced_scalaver = True
+                # Only replace the first occurrence of scala.binary.version
+                if not replaced_scala_binver:
+                    line, nsubs = re.subn(
+                        r"<scala.binary.version>[0-9\.]*",
+                        f"<scala.binary.version>{scala_ver}",
+                        line,
+                    )
+                    if nsubs > 0:
+                        replaced_scala_binver = True
+                f.write(line)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--purge-artifacts", action="store_true")
+    parser.add_argument(
+        "--scala-version",
+        type=str,
+        required=True,
+        help="Version of Scala to use in the JVM packages",
+        choices=["2.12", "2.13"],
+    )
+    parsed_args = parser.parse_args()
+    main(parsed_args)
--- a/dev/prepare_jvm_release.py
+++ b/dev/prepare_jvm_release.py
@ -2,7 +2,6 @@ import argparse
 import errno
 import glob
 import os
-import platform
 import re
 import shutil
 import subprocess
@ -21,12 +20,14 @@ def normpath(path):
    else:
        return normalized

+
 def cp(source, target):
    source = normpath(source)
    target = normpath(target)
    print("cp {0} {1}".format(source, target))
    shutil.copy(source, target)

+
 def maybe_makedirs(path):
    path = normpath(path)
    print("mkdir -p " + path)
@ -36,6 +37,7 @@ def maybe_makedirs(path):
        if e.errno != errno.EEXIST:
            raise

+
@contextmanager
 def cd(path):
    path = normpath(path)
@ -47,18 +49,22 @@ def cd(path):
    finally:
        os.chdir(cwd)

+
 def run(command, **kwargs):
    print(command)
    subprocess.check_call(command, shell=True, **kwargs)

+
 def get_current_git_tag():
    out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"])
    return out.decode().split("\n")[0]

+
 def get_current_commit_hash():
    out = subprocess.check_output(["git", "rev-parse", "HEAD"])
    return out.decode().split("\n")[0]

+
 def get_current_git_branch():
    out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"])
    m = re.search(r"release_[0-9\.]+", out.decode())
@ -66,38 +72,49 @@ def get_current_git_branch():
        raise ValueError("Expected branch name of form release_xxx")
    return m.group(0)

+
 def retrieve(url, filename=None):
    print(f"{url} -> {filename}")
    return urlretrieve(url, filename)

+
 def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--release-version", type=str, required=True,
-                        help="Version of the release being prepared")
+    parser.add_argument(
+        "--release-version",
+        type=str,
+        required=True,
+        help="Version of the release being prepared",
+    )
    args = parser.parse_args()
-
-    if sys.platform != "darwin" or platform.machine() != "x86_64":
-        raise NotImplementedError("Please run this script using an Intel Mac")
-
    version = args.release_version
    expected_git_tag = "v" + version
    current_git_tag = get_current_git_tag()
    if current_git_tag != expected_git_tag:
        if not current_git_tag:
-            raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
-                             f"Run: git checkout {expected_git_tag}")
-        raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD is at tag "
-                         f"{current_git_tag}. Run: git checkout {expected_git_tag}")
+            raise ValueError(
+                f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
+                f"Run: git checkout {expected_git_tag}"
+            )
+        raise ValueError(
+            f"Expected git tag {expected_git_tag} but current HEAD is at tag "
+            f"{current_git_tag}. Run: git checkout {expected_git_tag}"
+        )

    commit_hash = get_current_commit_hash()
    git_branch = get_current_git_branch()
-    print(f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}")
+    print(
+        f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}"
+    )

    with cd("jvm-packages/"):
        print("====copying pure-Python tracker====")
        for use_cuda in [True, False]:
            xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j"
-            cp("../python-package/xgboost/tracker.py", f"{xgboost4j}/src/main/resources")
+            cp(
+                "../python-package/xgboost/tracker.py",
+                f"{xgboost4j}/src/main/resources",
+            )

        print("====copying resources for testing====")
        with cd("../demo/CLI/regression"):
@ -115,7 +132,12 @@ def main():
                cp(file, f"{xgboost4j_spark}/src/test/resources")

        print("====Creating directories to hold native binaries====")
-        for os_ident, arch in [("linux", "x86_64"), ("windows", "x86_64"), ("macos", "x86_64")]:
+        for os_ident, arch in [
+            ("linux", "x86_64"),
+            ("windows", "x86_64"),
+            ("macos", "x86_64"),
+            ("macos", "aarch64"),
+        ]:
            output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
            maybe_makedirs(output_dir)
        for os_ident, arch in [("linux", "x86_64")]:
@ -123,52 +145,98 @@ def main():
            maybe_makedirs(output_dir)

        print("====Downloading native binaries from CI====")
-        nightly_bucket_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
-        maven_repo_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
+        nightly_bucket_prefix = (
+            "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
+        )
+        maven_repo_prefix = (
+            "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
+        )

-        retrieve(url=f"{nightly_bucket_prefix}/{git_branch}/xgboost4j_{commit_hash}.dll",
-                 filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll")
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/xgboost4j_{commit_hash}.dll",
+            filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll",
+        )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
+        )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_m1_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/aarch64/libxgboost4j.dylib",
+        )

        with tempfile.TemporaryDirectory() as tempdir:
            # libxgboost4j.so for Linux x86_64, CPU only
            zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar")
            extract_dir = os.path.join(tempdir, "xgboost4j")
-            retrieve(url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
+            retrieve(
+                url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
                f"xgboost4j_2.12-{version}.jar",
-                     filename=zip_path)
+                filename=zip_path,
+            )
            os.mkdir(extract_dir)
            with zipfile.ZipFile(zip_path, "r") as t:
                t.extractall(extract_dir)
-            cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
-               "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
+            cp(
+                os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
+                "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
+            )

            # libxgboost4j.so for Linux x86_64, GPU support
            zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar")
            extract_dir = os.path.join(tempdir, "xgboost4j-gpu")
-            retrieve(url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
+            retrieve(
+                url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
                f"xgboost4j-gpu_2.12-{version}.jar",
-                     filename=zip_path)
+                filename=zip_path,
+            )
            os.mkdir(extract_dir)
            with zipfile.ZipFile(zip_path, "r") as t:
                t.extractall(extract_dir)
-            cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
-               "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
-
+            cp(
+                os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
+                "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
+            )

    print("====Next Steps====")
    print("1. Gain upload right to Maven Central repo.")
    print("1-1. Sign up for a JIRA account at Sonatype: ")
-    print("1-2. File a JIRA ticket: "
+    print(
+        "1-2. File a JIRA ticket: "
        "https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
-          "https://issues.sonatype.org/browse/OSSRH-67724")
-    print("2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
-          "https://central.sonatype.org/publish/publish-maven/")
-    print("3. Now on a Mac machine, run:")
-    print("   GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests")
-    print("4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
-          "Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 "
+        "https://issues.sonatype.org/browse/OSSRH-67724"
+    )
+    print(
+        "2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
+        "https://central.sonatype.org/publish/publish-maven/"
+    )
+    print(
+        "3. Now on a Linux machine, run the following to build Scala 2.12 artifacts. "
+        "Make sure to use an Internet connection with fast upload speed:"
+    )
+    print(
+        "   # Skip native build, since we have all needed native binaries from CI\n"
+        "   export MAVEN_SKIP_NATIVE_BUILD=1\n"
+        "   GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
+    )
+    print(
+        "4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
+        "Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
        "to inspect the staged JAR files. Finally, press Release button to publish the "
-          "artifacts to the Maven Central repository.")
+        "artifacts to the Maven Central repository. The top-level metapackage should be "
+        "named xgboost-jvm_2.12."
+    )
+    print(
+        "5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
+        "   export MAVEN_SKIP_NATIVE_BUILD=1\n"
+        "   python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
+        "   GPG_TTY=$(tty) mvn deploy -Prelease-cpu-only,scala-2.13 -DskipTests"
+    )
+    print(
+        "6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts. "
+        "The top-level metapackage should be named xgboost-jvm_2.13."
+    )
+

 if __name__ == "__main__":
    main()
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@ -329,7 +329,7 @@ Parameters for Linear Booster (``booster=gblinear``)
  - Choice of algorithm to fit linear model

    - ``shotgun``: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run.
-    - ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution.
+    - ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the ``device`` parameter is set to ``cuda`` or ``gpu``, a GPU variant would be used.

 * ``feature_selector`` [default= ``cyclic``]

--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@ -58,19 +58,19 @@
 /*!
 * \brief Tag function as usable by device
 */
-#if defined (__CUDA__) || defined(__NVCC__)
+#if defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
 #define XGBOOST_DEVICE __host__ __device__
 #else
 #define XGBOOST_DEVICE
-#endif  // defined (__CUDA__) || defined(__NVCC__)
+#endif  // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)

-#if defined(__CUDA__) || defined(__CUDACC__)
+#if defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
 #define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
 #define XGBOOST_DEV_INLINE __device__ __forceinline__
 #else
 #define XGBOOST_HOST_DEV_INLINE
 #define XGBOOST_DEV_INLINE
-#endif  // defined(__CUDA__) || defined(__CUDACC__)
+#endif  // defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)

 // These check are for Makefile.
 #if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
@ -274,8 +274,8 @@ class GradientPairInt64 {
  GradientPairInt64(GradientPairInt64 const &g) = default;
  GradientPairInt64 &operator=(GradientPairInt64 const &g) = default;

-  XGBOOST_DEVICE [[nodiscard]] T GetQuantisedGrad() const { return grad_; }
-  XGBOOST_DEVICE [[nodiscard]] T GetQuantisedHess() const { return hess_; }
+  [[nodiscard]] XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; }
+  [[nodiscard]] XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; }

  XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) {
    grad_ += rhs.grad_;
--- a/include/xgboost/global_config.h
+++ b/include/xgboost/global_config.h
@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 by Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
 * \file global_config.h
 * \brief Global configuration for XGBoost
 * \author Hyunsu Cho
@ -7,24 +7,22 @@
 #ifndef XGBOOST_GLOBAL_CONFIG_H_
 #define XGBOOST_GLOBAL_CONFIG_H_

-#include <xgboost/parameter.h>
-#include <vector>
-#include <string>
+#include <dmlc/thread_local.h>  // for ThreadLocalStore
+#include <xgboost/parameter.h>  // for XGBoostParameter
+
+#include <cstdint>  // for int32_t

 namespace xgboost {
-class Json;
-
 struct GlobalConfiguration : public XGBoostParameter<GlobalConfiguration> {
-  int verbosity { 1 };
-  bool use_rmm { false };
+  std::int32_t verbosity{1};
+  bool use_rmm{false};
  DMLC_DECLARE_PARAMETER(GlobalConfiguration) {
    DMLC_DECLARE_FIELD(verbosity)
        .set_range(0, 3)
        .set_default(1)  // shows only warning
        .describe("Flag to print out detailed breakdown of runtime.");
-    DMLC_DECLARE_FIELD(use_rmm)
-        .set_default(false)
-        .describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
+    DMLC_DECLARE_FIELD(use_rmm).set_default(false).describe(
+        "Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
  }
 };

--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@ -58,11 +58,11 @@

 namespace xgboost {

-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 // Sets a function to call instead of cudaSetDevice();
 // only added for testing
 void SetCudaSetDeviceHandler(void (*handler)(int));
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__

 template <typename T> struct HostDeviceVectorImpl;

--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@ -30,11 +30,11 @@

 // decouple it from xgboost.
 #ifndef LINALG_HD
-#if defined(__CUDA__) || defined(__NVCC__)
+#if defined(__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
 #define LINALG_HD __host__ __device__
 #else
 #define LINALG_HD
-#endif  // defined (__CUDA__) || defined(__NVCC__)
+#endif  // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
 #endif  // LINALG_HD

 namespace xgboost::linalg {
@ -118,7 +118,7 @@ using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value,

 template <int32_t n, typename Fn>
 LINALG_HD constexpr auto UnrollLoop(Fn fn) {
-#if defined __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 #pragma unroll n
 #endif  // defined __CUDA_ARCH__
  for (int32_t i = 0; i < n; ++i) {
@ -136,7 +136,7 @@ int32_t NativePopc(T v) {
 inline LINALG_HD int Popc(uint32_t v) {
 #if defined(__CUDA_ARCH__)
  return __popc(v);
-#elif defined(__GNUC__) || defined(__clang__)
+#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
  return __builtin_popcount(v);
 #elif defined(_MSC_VER)
  return __popcnt(v);
@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) {
 inline LINALG_HD int Popc(uint64_t v) {
 #if defined(__CUDA_ARCH__)
  return __popcll(v);
-#elif defined(__GNUC__) || defined(__clang__)
+#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
  return __builtin_popcountll(v);
 #elif defined(_MSC_VER) && defined(_M_X64)
  return __popcnt64(v);
--- a/include/xgboost/span.h
+++ b/include/xgboost/span.h
@ -41,7 +41,9 @@

 #if defined(__CUDACC__)
 #include <cuda_runtime.h>
-#endif  // defined(__CUDACC__)
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#endif

 /*!
 * The version number 1910 is picked up from GSL.
@ -104,7 +106,42 @@ namespace common {

 #define SPAN_CHECK KERNEL_CHECK

-#else  // ------------------------------ not CUDA ----------------------------
+#elif defined(__HIPCC__)
+// Usual logging facility is not available inside device code.
+
+#if defined(_MSC_VER)
+
+// Windows HIP doesn't have __assert_fail.
+#define HIP_KERNEL_CHECK(cond)           \
+  do {                                    \
+    if (XGBOOST_EXPECT(!(cond), false)) { \
+      __builtin_trap();                           \
+    }                                     \
+  } while (0)
+
+#else  // defined(_MSC_VER)
+
+#define __ASSERT_STR_HELPER(x) #x
+
+#if 0
+#define HIP_KERNEL_CHECK(cond)  \
+  (XGBOOST_EXPECT((cond), true) \
+       ? static_cast<void>(0)   \
+       : __assert_fail(__ASSERT_STR_HELPER((cond)), __FILE__, __LINE__, __PRETTY_FUNCTION__))
+#else
+#define HIP_KERNEL_CHECK(cond)  \
+  (XGBOOST_EXPECT((cond), true) \
+       ? static_cast<void>(0)   \
+       : __builtin_trap())
+#endif
+
+#endif  // defined(_MSC_VER)
+
+#define KERNEL_CHECK HIP_KERNEL_CHECK
+
+#define SPAN_CHECK KERNEL_CHECK
+
+#else  // ------------------------------ not CUDA or HIP ----------------------------

 #if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1

@ -120,7 +157,7 @@ namespace common {

 #endif  // defined(XGBOOST_STRICT_R_MODE)

-#endif  // __CUDA_ARCH__
+#endif  // __CUDA_ARCH__ || __HIPCC__

 #define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))

@ -317,7 +354,7 @@ struct IsSpanOracle<Span<T, Extent>> : std::true_type {};
 template <class T>
 struct IsSpan : public IsSpanOracle<typename std::remove_cv<T>::type> {};

-// Re-implement std algorithms here to adopt CUDA.
+// Re-implement std algorithms here to adopt CUDA/HIP
 template <typename T>
 struct Less {
  XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const {
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@ -6,6 +6,6 @@

 #define XGBOOST_VER_MAJOR 2  /* NOLINT */
 #define XGBOOST_VER_MINOR 0  /* NOLINT */
-#define XGBOOST_VER_PATCH 0  /* NOLINT */
+#define XGBOOST_VER_PATCH 3  /* NOLINT */

 #endif  // XGBOOST_VERSION_CONFIG_H_
--- a/jvm-packages/CMakeLists.txt
+++ b/jvm-packages/CMakeLists.txt
@ -9,6 +9,11 @@ if (USE_CUDA)
    ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
 endif (USE_CUDA)

+if (USE_HIP)
+  list(APPEND JVM_SOURCES
+    ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip)
+endif (USE_HIP)
+
 add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})

 if (ENABLE_ALL_WARNINGS)
@ -25,4 +30,3 @@ target_include_directories(xgboost4j
  ${PROJECT_SOURCE_DIR}/rabit/include)

 set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
-target_link_libraries(xgboost4j PRIVATE ${JAVA_JVM_LIBRARY})
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python
-import errno
 import argparse
+import errno
 import glob
 import os
 import platform
@ -19,11 +19,12 @@ CONFIG = {
    "USE_HDFS": "OFF",
    "USE_AZURE": "OFF",
    "USE_S3": "OFF",
-
    "USE_CUDA": "OFF",
    "USE_NCCL": "OFF",
+    "USE_HIP": "OFF",
+    "USE_RCCL": "OFF",
    "JVM_BINDINGS": "ON",
-    "LOG_CAPI_INVOCATION": "OFF"
+    "LOG_CAPI_INVOCATION": "OFF",
 }


@ -70,26 +71,22 @@ def normpath(path):
        return normalized


-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
-    parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
-    cli_args = parser.parse_args()
-
+def native_build(args):
    if sys.platform == "darwin":
        # Enable of your compiler supports OpenMP.
        CONFIG["USE_OPENMP"] = "OFF"
-        os.environ["JAVA_HOME"] = subprocess.check_output(
-            "/usr/libexec/java_home").strip().decode()
+        os.environ["JAVA_HOME"] = (
+            subprocess.check_output("/usr/libexec/java_home").strip().decode()
+        )

    print("building Java wrapper")
    with cd(".."):
-        build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build'
+        build_dir = "build-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "build"
        maybe_makedirs(build_dir)
        with cd(build_dir):
            if sys.platform == "win32":
                # Force x64 build on Windows.
-                maybe_generator = ' -A x64'
+                maybe_generator = " -A x64"
            else:
                maybe_generator = ""
            if sys.platform == "linux":
@ -97,12 +94,15 @@ if __name__ == "__main__":
            else:
                maybe_parallel_build = ""

-            if cli_args.log_capi_invocation == 'ON':
-                CONFIG['LOG_CAPI_INVOCATION'] = 'ON'
+            if cli_args.log_capi_invocation == "ON":
+                CONFIG["LOG_CAPI_INVOCATION"] = "ON"

-            if cli_args.use_cuda == 'ON':
-                CONFIG['USE_CUDA'] = 'ON'
-                CONFIG['USE_NCCL'] = 'ON'
+            if cli_args.use_cuda == "ON":
+                CONFIG["USE_CUDA"] = "ON"
+                CONFIG["USE_NCCL"] = "ON"
+            elif cli_args.use_hip == "ON":
+                CONFIG["USE_HIP"] = "ON"
+                CONFIG["USE_RCCL"] = "ON"

            args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]

@ -115,7 +115,7 @@ if __name__ == "__main__":
            if gpu_arch_flag is not None:
                args.append("%s" % gpu_arch_flag)

-            lib_dir = os.path.join(os.pardir, 'lib')
+            lib_dir = os.path.join(os.pardir, "lib")
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
            run("cmake .. " + " ".join(args) + maybe_generator)
@ -125,8 +125,10 @@ if __name__ == "__main__":
            run(f'"{sys.executable}" mapfeat.py')
            run(f'"{sys.executable}" mknfold.py machine.txt 1')

-    xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j'
-    xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark'
+    xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "xgboost4j"
+    xgboost4j_spark = (
+        "xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "xgboost4j-spark"
+    )

    print("copying native library")
    library_name, os_folder = {
@ -141,14 +143,19 @@ if __name__ == "__main__":
        "i86pc": "x86_64",  # on Solaris x86_64
        "sun4v": "sparc",  # on Solaris sparc
        "arm64": "aarch64",  # on macOS & Windows ARM 64-bit
-        "aarch64": "aarch64"
+        "aarch64": "aarch64",
    }[platform.machine().lower()]
-    output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder)
+    output_folder = "{}/src/main/resources/lib/{}/{}".format(
+        xgboost4j, os_folder, arch_folder
+    )
    maybe_makedirs(output_folder)
    cp("../lib/" + library_name, output_folder)

    print("copying pure-Python tracker")
-    cp("../python-package/xgboost/tracker.py", "{}/src/main/resources".format(xgboost4j))
+    cp(
+        "../python-package/xgboost/tracker.py",
+        "{}/src/main/resources".format(xgboost4j),
+    )

    print("copying train/test files")
    maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
@ -164,3 +171,18 @@ if __name__ == "__main__":
    maybe_makedirs("{}/src/test/resources".format(xgboost4j))
    for file in glob.glob("../demo/data/agaricus.*"):
        cp(file, "{}/src/test/resources".format(xgboost4j))
+
+
+if __name__ == "__main__":
+    if "MAVEN_SKIP_NATIVE_BUILD" in os.environ:
+        print("MAVEN_SKIP_NATIVE_BUILD is set. Skipping native build...")
+    else:
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--log-capi-invocation", type=str, choices=["ON", "OFF"], default="OFF"
+        )
+        parser.add_argument(
+            "--use-cuda", type=str, choices=["ON", "OFF"], default="OFF"
+        )
+        cli_args = parser.parse_args()
+        native_build(cli_args)
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@ -5,8 +5,8 @@
    <modelVersion>4.0.0</modelVersion>

    <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm</artifactId>
-    <version>2.0.0-SNAPSHOT</version>
+    <artifactId>xgboost-jvm_2.12</artifactId>
+    <version>2.0.3</version>
    <packaging>pom</packaging>
    <name>XGBoost JVM Package</name>
    <description>JVM Package for XGBoost</description>
@ -43,6 +43,7 @@
        <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
        <log.capi.invocation>OFF</log.capi.invocation>
        <use.cuda>OFF</use.cuda>
+        <use.hip>OFF</use.hip>
        <cudf.version>23.08.0</cudf.version>
        <spark.rapids.version>23.08.0</spark.rapids.version>
        <cudf.classifier>cuda11</cudf.classifier>
@ -189,6 +190,93 @@
                </plugins>
            </build>
        </profile>
+        <profile>
+            <id>release-cpu-only</id>
+            <modules>
+                <module>xgboost4j</module>
+                <module>xgboost4j-example</module>
+                <module>xgboost4j-spark</module>
+                <module>xgboost4j-flink</module>
+            </modules>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-jar-plugin</artifactId>
+                        <version>3.3.0</version>
+                        <executions>
+                            <execution>
+                                <id>empty-javadoc-jar</id>
+                                <phase>package</phase>
+                                <goals>
+                                    <goal>jar</goal>
+                                </goals>
+                                <configuration>
+                                    <classifier>javadoc</classifier>
+                                    <classesDirectory>${basedir}/javadoc</classesDirectory>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-release-plugin</artifactId>
+                        <version>3.0.1</version>
+                        <configuration>
+                            <autoVersionSubmodules>true</autoVersionSubmodules>
+                            <useReleaseProfile>false</useReleaseProfile>
+                            <releaseProfiles>release</releaseProfiles>
+                            <goals>deploy</goals>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-gpg-plugin</artifactId>
+                        <version>3.1.0</version>
+                        <executions>
+                            <execution>
+                                <id>sign-artifacts</id>
+                                <phase>verify</phase>
+                                <goals>
+                                    <goal>sign</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-source-plugin</artifactId>
+                        <version>3.3.0</version>
+                        <executions>
+                            <execution>
+                                <id>attach-sources</id>
+                                <goals>
+                                    <goal>jar-no-fork</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.sonatype.plugins</groupId>
+                        <artifactId>nexus-staging-maven-plugin</artifactId>
+                        <version>1.6.13</version>
+                        <extensions>true</extensions>
+                        <configuration>
+                            <serverId>ossrh</serverId>
+                            <nexusUrl>https://oss.sonatype.org/</nexusUrl>
+                            <autoReleaseAfterClose>false</autoReleaseAfterClose>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-surefire-plugin</artifactId>
+                        <configuration>
+                            <skipTests>true</skipTests>
+                        </configuration>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
        <profile>
            <id>assembly</id>
            <build>
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@ -5,12 +5,12 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
-        <version>2.0.0-SNAPSHOT</version>
+        <artifactId>xgboost-jvm_2.12</artifactId>
+        <version>2.0.3</version>
    </parent>
    <name>xgboost4j-example</name>
-    <artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
-    <version>2.0.0-SNAPSHOT</version>
+    <artifactId>xgboost4j-example_2.12</artifactId>
+    <version>2.0.3</version>
    <packaging>jar</packaging>
    <build>
        <plugins>
@ -26,7 +26,7 @@
    <dependencies>
        <dependency>
            <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j-spark_2.12</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
@ -37,7 +37,7 @@
        </dependency>
        <dependency>
            <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j-flink_2.12</artifactId>
            <version>${project.version}</version>
        </dependency>
    </dependencies>
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@ -5,13 +5,13 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
-        <version>2.0.0-SNAPSHOT</version>
+        <artifactId>xgboost-jvm_2.12</artifactId>
+        <version>2.0.3</version>
    </parent>

    <name>xgboost4j-flink</name>
-    <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-    <version>2.0.0-SNAPSHOT</version>
+    <artifactId>xgboost4j-flink_2.12</artifactId>
+    <version>2.0.3</version>
    <properties>
      <flink-ml.version>2.2.0</flink-ml.version>
    </properties>
@ -30,7 +30,7 @@
    <dependencies>
        <dependency>
            <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j_2.12</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@ -5,12 +5,12 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
-        <version>2.0.0-SNAPSHOT</version>
+        <artifactId>xgboost-jvm_2.12</artifactId>
+        <version>2.0.3</version>
    </parent>
-    <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-gpu_2.12</artifactId>
    <name>xgboost4j-gpu</name>
-    <version>2.0.0-SNAPSHOT</version>
+    <version>2.0.3</version>
    <packaging>jar</packaging>

    <dependencies>
@ -104,6 +104,8 @@
                                <argument>${log.capi.invocation}</argument>
                                <argument>--use-cuda</argument>
                                <argument>${use.cuda}</argument>
+                                <argument>--use-hip</argument>
+                                <argument>${use.hip}</argument>
                            </arguments>
                            <workingDirectory>${user.dir}</workingDirectory>
                        </configuration>
--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp
@ -2,7 +2,7 @@
 // Created by bobwang on 2021/9/8.
 //

-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 #include <jni.h>

--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu
@ -1,6 +1,10 @@
 #include <jni.h>

+#if defined(XGBOOST_USE_CUDA)
 #include "../../../../src/common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../../../src/common/device_helpers.hip.h"
+#endif
 #include "../../../../src/common/cuda_pinned_allocator.h"
 #include "../../../../src/data/array_interface.h"
 #include "jvm_utils.h"
--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "xgboost4j-gpu.cu"
+#endif
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@ -5,11 +5,11 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
-        <version>2.0.0-SNAPSHOT</version>
+        <artifactId>xgboost-jvm_2.12</artifactId>
+        <version>2.0.3</version>
    </parent>
    <name>xgboost4j-spark-gpu</name>
-    <artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
    <build>
        <plugins>
            <plugin>
@ -24,7 +24,7 @@
    <dependencies>
        <dependency>
            <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j-gpu_2.12</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@ -5,11 +5,11 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
-        <version>2.0.0-SNAPSHOT</version>
+        <artifactId>xgboost-jvm_2.12</artifactId>
+        <version>2.0.3</version>
    </parent>
    <name>xgboost4j-spark</name>
-    <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-spark_2.12</artifactId>
    <build>
        <plugins>
            <plugin>
@ -24,7 +24,7 @@
    <dependencies>
        <dependency>
            <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j_2.12</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@ -5,12 +5,12 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
-        <version>2.0.0-SNAPSHOT</version>
+        <artifactId>xgboost-jvm_2.12</artifactId>
+        <version>2.0.3</version>
    </parent>
    <name>xgboost4j</name>
-    <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-    <version>2.0.0-SNAPSHOT</version>
+    <artifactId>xgboost4j_2.12</artifactId>
+    <version>2.0.3</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/python-package/packager/build_config.py
+++ b/python-package/packager/build_config.py
@ -15,6 +15,10 @@ class BuildConfiguration:  # pylint: disable=R0902
    use_cuda: bool = False
    # Whether to enable NCCL
    use_nccl: bool = False
+    # Whether to enablea HIP 
+    use_hip: bool = False
+    # Whether to enable RCCL
+    use_rccl: bool = False
    # Whether to enable HDFS
    use_hdfs: bool = False
    # Whether to enable Azure Storage
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@ -132,17 +132,29 @@ def locate_or_build_libxgboost(

    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
-        libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
-        if not libxgboost_sys.exists():
+        sys_prefix = pathlib.Path(sys.base_prefix)
+        sys_prefix_candidates = [
+            sys_prefix / "lib",
+            # Paths possibly used on Windows
+            sys_prefix / "bin",
+            sys_prefix / "Library",
+            sys_prefix / "Library" / "bin",
+            sys_prefix / "Library" / "lib",
+        ]
+        sys_prefix_candidates = [
+            p.expanduser().resolve() for p in sys_prefix_candidates
+        ]
+        for candidate_dir in sys_prefix_candidates:
+            libtreelite_sys = candidate_dir / _lib_name()
+            if libtreelite_sys.exists():
+                logger.info("Using system XGBoost: %s", str(libtreelite_sys))
+                return libtreelite_sys
        raise RuntimeError(
            f"use_system_libxgboost was specified but {_lib_name()} is "
-                f"not found in {libxgboost_sys.parent}"
+            f"not found. Paths searched (in order): \n"
+            + "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
        )

-        logger.info("Using system XGBoost: %s", str(libxgboost_sys))
-        return libxgboost_sys
-
    libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
    if libxgboost is not None:
        return libxgboost
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@ -7,7 +7,7 @@ build-backend = "packager.pep517"

 [project]
 name = "xgboost"
-version = "2.0.0-dev"
+version = "2.0.3"
 authors = [
    { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
    { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@ -1 +1 @@
-2.0.0-dev
+2.0.3
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@ -88,6 +88,18 @@ def is_cudf_available() -> bool:
        return False


+def is_cupy_available() -> bool:
+    """Check cupy package available or not"""
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+
+        return True
+    except ImportError:
+        return False
+
+
 try:
    import scipy.sparse as scipy_sparse
    from scipy.sparse import csr_matrix as scipy_csr
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@ -206,6 +206,7 @@ def _load_lib() -> ctypes.CDLL:
            lib = ctypes.cdll.LoadLibrary(lib_path)
            setattr(lib, "path", os.path.normpath(lib_path))
            lib_success = True
+            break
        except OSError as e:
            os_error_list.append(str(e))
            continue
@ -2399,6 +2400,7 @@ class Booster:
            _is_cudf_df,
            _is_cupy_array,
            _is_list,
+            _is_np_array_like,
            _is_pandas_df,
            _is_pandas_series,
            _is_tuple,
@ -2428,7 +2430,7 @@ class Booster:
                    f"got {data.shape[1]}"
                )

-        if isinstance(data, np.ndarray):
+        if _is_np_array_like(data):
            from .data import _ensure_np_dtype

            data, _ = _ensure_np_dtype(data, data.dtype)
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@ -78,7 +78,6 @@ from .data import _is_cudf_ser, _is_cupy_array
 from .sklearn import (
    XGBClassifier,
    XGBClassifierBase,
-    XGBClassifierMixIn,
    XGBModel,
    XGBRanker,
    XGBRankerMixIn,
@ -1854,7 +1853,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
    "Implementation of the scikit-learn API for XGBoost classification.",
    ["estimators", "model"],
 )
-class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
+class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
    # pylint: disable=missing-class-docstring
    async def _fit_async(
        self,
@ -2036,10 +2035,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
            preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
        return preds

-    def load_model(self, fname: ModelIn) -> None:
-        super().load_model(fname)
-        self._load_model_attributes(self.get_booster())
-

@xgboost_model_doc(
    """Implementation of the Scikit-Learn API for XGBoost Ranking.
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@ -164,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
    return isinstance(data, scipy.sparse.coo_matrix)


-def _is_numpy_array(data: DataType) -> bool:
-    return isinstance(data, (np.ndarray, np.matrix))
+def _is_np_array_like(data: DataType) -> bool:
+    return hasattr(data, "__array_interface__")


 def _ensure_np_dtype(
@ -317,7 +317,6 @@ def pandas_feature_info(
 ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
    """Handle feature info for pandas dataframe."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse

    # handle feature names
    if feature_names is None and meta is None:
@ -332,10 +331,10 @@ def pandas_feature_info(
    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
            ) and enable_categorical:
                feature_types.append(CAT_T)
            else:
@ -345,18 +344,13 @@ def pandas_feature_info(

 def is_nullable_dtype(dtype: PandasDType) -> bool:
    """Whether dtype is a pandas nullable type."""
-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

    is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)


 def is_pa_ext_dtype(dtype: Any) -> bool:
@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
    )


+def is_pd_cat_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas category type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import CategoricalDtype
+
+            return isinstance(dtype, CategoricalDtype)
+
+    from pandas.api.types import is_categorical_dtype
+
+    return is_categorical_dtype(dtype)
+
+
+def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas sparse type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import SparseDtype
+
+            return isinstance(dtype, SparseDtype)
+
+    from pandas.api.types import is_sparse
+
+    return is_sparse(dtype)
+
+
 def pandas_cat_null(data: DataFrame) -> DataFrame:
    """Handle categorical dtype and nullable extension types from pandas."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype

    # handle category codes and nullable.
    cat_columns = []
    nul_columns = []
    # avoid an unnecessary conversion if possible
    for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
            cat_columns.append(col)
        elif is_pa_ext_categorical_dtype(dtype):
            raise ValueError(
@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
        transformed = data

    def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
            return ser.cat.codes
        assert is_pa_ext_categorical_dtype(ser.dtype)
        # Not yet supported, the index is not ordered for some reason. Alternately:
@ -454,14 +479,12 @@ def _transform_pandas_df(
    meta: Optional[str] = None,
    meta_type: Optional[NumpyDType] = None,
 ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse
-
    pyarrow_extension = False
    for dtype in data.dtypes:
        if not (
            (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
            or is_pa_ext_dtype(dtype)
        ):
            _invalid_dataframe_dtype(data)
@ -515,9 +538,8 @@ def _meta_from_pandas_series(
 ) -> None:
    """Help transform pandas series for meta data like labels"""
    data = data.values.astype("float")
-    from pandas.api.types import is_sparse

-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
        data = data.to_dense()  # type: ignore
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    _meta_from_numpy(data, name, dtype, handle)
@ -539,13 +561,11 @@ def _from_pandas_series(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype
-
    if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
    ):
        _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
        data = data.cat.codes
    return _from_numpy_array(
        data.values.reshape(data.shape[0], 1).astype("float"),
@ -1051,7 +1071,7 @@ def dispatch_data_backend(
        return _from_scipy_csr(
            data.tocsr(), missing, threads, feature_names, feature_types
        )
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        return _from_numpy_array(
            data, missing, threads, feature_names, feature_types, data_split_mode
        )
@ -1194,7 +1214,7 @@ def dispatch_meta_backend(
    if _is_tuple(data):
        _meta_from_tuple(data, name, dtype, handle)
        return
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        _meta_from_numpy(data, name, dtype, handle)
        return
    if _is_pandas_df(data):
@ -1281,7 +1301,7 @@ def _proxy_transform(
        return _transform_dlpack(data), None, feature_names, feature_types
    if _is_list(data) or _is_tuple(data):
        data = np.array(data)
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        data, _ = _ensure_np_dtype(data, data.dtype)
        return data, None, feature_names, feature_types
    if _is_scipy_csr(data):
@ -1331,7 +1351,7 @@ def dispatch_proxy_set_data(
    if not allow_host:
        raise err

-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        _check_data_shape(data)
        proxy._set_data_from_array(data)  # pylint: disable=W0212
        return
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@ -31,16 +31,15 @@ def find_lib_path() -> List[str]:
    ]

    if sys.platform == "win32":
-        if platform.architecture()[0] == "64bit":
-            dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/"))
-            # hack for pip installation when copy all parent source
-            # directory here
-            dll_path.append(os.path.join(curr_path, "./windows/x64/Release/"))
-        else:
-            dll_path.append(os.path.join(curr_path, "../../windows/Release/"))
-            # hack for pip installation when copy all parent source
-            # directory here
-            dll_path.append(os.path.join(curr_path, "./windows/Release/"))
+        # On Windows, Conda may install libs in different paths
+        dll_path.extend(
+            [
+                os.path.join(sys.base_prefix, "bin"),
+                os.path.join(sys.base_prefix, "Library"),
+                os.path.join(sys.base_prefix, "Library", "bin"),
+                os.path.join(sys.base_prefix, "Library", "lib"),
+            ]
+        )
        dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
    elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
        dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
 from .training import train


-class XGBClassifierMixIn:  # pylint: disable=too-few-public-methods
-    """MixIn for classification."""
-
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super().__init__(*args, **kwargs)
-
-    def _load_model_attributes(self, booster: Booster) -> None:
-        config = json.loads(booster.save_config())
-        self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
-        # binary classification is treated as regression in XGBoost.
-        self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
-
-
 class XGBRankerMixIn:  # pylint: disable=too-few-public-methods
    """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
    base classes.
@ -845,9 +832,7 @@ class XGBModel(XGBModelBase):
        self.get_booster().load_model(fname)

        meta_str = self.get_booster().attr("scikit_learn")
-        if meta_str is None:
-            return
-
+        if meta_str is not None:
            meta = json.loads(meta_str)
            t = meta.get("_estimator_type", None)
            if t is not None and t != self._get_type():
@ -855,11 +840,30 @@ class XGBModel(XGBModelBase):
                    "Loading an estimator with different type. Expecting: "
                    f"{self._get_type()}, got: {t}"
                )
+
        self.feature_types = self.get_booster().feature_types
        self.get_booster().set_attr(scikit_learn=None)
+        config = json.loads(self.get_booster().save_config())
+        self._load_model_attributes(config)

    load_model.__doc__ = f"""{Booster.load_model.__doc__}"""

+    def _load_model_attributes(self, config: dict) -> None:
+        """Load model attributes without hyper-parameters."""
+        from sklearn.base import is_classifier
+
+        booster = self.get_booster()
+
+        self.objective = config["learner"]["objective"]["name"]
+        self.booster = config["learner"]["gradient_booster"]["name"]
+        self.base_score = config["learner"]["learner_model_param"]["base_score"]
+        self.feature_types = booster.feature_types
+
+        if is_classifier(self):
+            self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
+            # binary classification is treated as regression in XGBoost.
+            self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
+
    # pylint: disable=too-many-branches
    def _configure_fit(
        self,
@ -1409,7 +1413,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
        Number of boosting rounds.
 """,
 )
-class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
+class XGBClassifier(XGBModel, XGBClassifierBase):
    # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
    @_deprecate_positional_args
    def __init__(
@ -1637,10 +1641,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
    def classes_(self) -> np.ndarray:
        return np.arange(self.n_classes_)

-    def load_model(self, fname: ModelIn) -> None:
-        super().load_model(fname)
-        self._load_model_attributes(self.get_booster())
-

@xgboost_model_doc(
    "scikit-learn API for XGBoost random forest classification.",
@ -2093,7 +2093,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):

        """
        X, qid = _get_qid(X, None)
-        Xyq = DMatrix(X, y, qid=qid)
+        # fixme(jiamingy): base margin and group weight is not yet supported. We might
+        # need to make extra special fields in the dataframe.
+        Xyq = DMatrix(
+            X,
+            y,
+            qid=qid,
+            missing=self.missing,
+            enable_categorical=self.enable_categorical,
+            nthread=self.n_jobs,
+            feature_types=self.feature_types,
+        )
        if callable(self.eval_metric):
            metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
            result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@ -22,7 +22,7 @@ from typing import (

 import numpy as np
 import pandas as pd
-from pyspark import SparkContext, cloudpickle
+from pyspark import RDD, SparkContext, cloudpickle
 from pyspark.ml import Estimator, Model
 from pyspark.ml.functions import array_to_vector, vector_to_array
 from pyspark.ml.linalg import VectorUDT
@ -44,6 +44,7 @@ from pyspark.ml.util import (
    MLWritable,
    MLWriter,
 )
+from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
 from pyspark.sql import Column, DataFrame
 from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
 from pyspark.sql.types import (
@ -59,11 +60,12 @@ from scipy.special import expit, softmax  # pylint: disable=no-name-in-module

 import xgboost
 from xgboost import XGBClassifier
-from xgboost.compat import is_cudf_available
+from xgboost.compat import is_cudf_available, is_cupy_available
 from xgboost.core import Booster, _check_distributed_params
 from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
 from xgboost.training import train as worker_train

+from .._typing import ArrayLike
 from .data import (
    _read_csr_matrix_from_unwrapped_spark_vec,
    alias,
@ -87,6 +89,7 @@ from .utils import (
    _get_rabit_args,
    _get_spark_session,
    _is_local,
+    _is_standalone_or_localcluster,
    deserialize_booster,
    deserialize_xgb_model,
    get_class_name,
@ -241,6 +244,13 @@ class _SparkXGBParams(
        TypeConverters.toList,
    )

+    def set_device(self, value: str) -> "_SparkXGBParams":
+        """Set device, optional value: cpu, cuda, gpu"""
+        _check_distributed_params({"device": value})
+        assert value in ("cpu", "cuda", "gpu")
+        self.set(self.device, value)
+        return self
+
    @classmethod
    def _xgb_cls(cls) -> Type[XGBModel]:
        """
@ -334,6 +344,54 @@ class _SparkXGBParams(
                predict_params[param.name] = self.getOrDefault(param)
        return predict_params

+    def _validate_gpu_params(self) -> None:
+        """Validate the gpu parameters and gpu configurations"""
+
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+            ss = _get_spark_session()
+            sc = ss.sparkContext
+
+            if _is_local(sc):
+                # Support GPU training in Spark local mode is just for debugging
+                # purposes, so it's okay for printing the below warning instead of
+                # checking the real gpu numbers and raising the exception.
+                get_logger(self.__class__.__name__).warning(
+                    "You have enabled GPU in spark local mode. Please make sure your"
+                    " local node has at least %d GPUs",
+                    self.getOrDefault(self.num_workers),
+                )
+            else:
+                executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
+                if executor_gpus is None:
+                    raise ValueError(
+                        "The `spark.executor.resource.gpu.amount` is required for training"
+                        " on GPU."
+                    )
+
+                if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
+                    # We will enable stage-level scheduling in spark 3.4.0+ which doesn't
+                    # require spark.task.resource.gpu.amount to be set explicitly
+                    gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
+                    if gpu_per_task is not None:
+                        if float(gpu_per_task) < 1.0:
+                            raise ValueError(
+                                "XGBoost doesn't support GPU fractional configurations. "
+                                "Please set `spark.task.resource.gpu.amount=spark.executor"
+                                ".resource.gpu.amount`"
+                            )
+
+                        if float(gpu_per_task) > 1.0:
+                            get_logger(self.__class__.__name__).warning(
+                                "%s GPUs for each Spark task is configured, but each "
+                                "XGBoost training task uses only 1 GPU.",
+                                gpu_per_task,
+                            )
+                    else:
+                        raise ValueError(
+                            "The `spark.task.resource.gpu.amount` is required for training"
+                            " on GPU."
+                        )
+
    def _validate_params(self) -> None:
        # pylint: disable=too-many-branches
        init_model = self.getOrDefault("xgb_model")
@ -413,53 +471,7 @@ class _SparkXGBParams(
                    "`pyspark.ml.linalg.Vector` type."
                )

-        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
-            gpu_per_task = (
-                _get_spark_session()
-                .sparkContext.getConf()
-                .get("spark.task.resource.gpu.amount")
-            )
-
-            is_local = _is_local(_get_spark_session().sparkContext)
-
-            if is_local:
-                # checking spark local mode.
-                if gpu_per_task is not None:
-                    raise RuntimeError(
-                        "The spark local mode does not support gpu configuration."
-                        "Please remove spark.executor.resource.gpu.amount and "
-                        "spark.task.resource.gpu.amount"
-                    )
-
-                # Support GPU training in Spark local mode is just for debugging
-                # purposes, so it's okay for printing the below warning instead of
-                # checking the real gpu numbers and raising the exception.
-                get_logger(self.__class__.__name__).warning(
-                    "You have enabled GPU in spark local mode. Please make sure your"
-                    " local node has at least %d GPUs",
-                    self.getOrDefault(self.num_workers),
-                )
-            else:
-                # checking spark non-local mode.
-                if gpu_per_task is not None:
-                    if float(gpu_per_task) < 1.0:
-                        raise ValueError(
-                            "XGBoost doesn't support GPU fractional configurations. "
-                            "Please set `spark.task.resource.gpu.amount=spark.executor"
-                            ".resource.gpu.amount`"
-                        )
-
-                    if float(gpu_per_task) > 1.0:
-                        get_logger(self.__class__.__name__).warning(
-                            "%s GPUs for each Spark task is configured, but each "
-                            "XGBoost training task uses only 1 GPU.",
-                            gpu_per_task,
-                        )
-                else:
-                    raise ValueError(
-                        "The `spark.task.resource.gpu.amount` is required for training"
-                        " on GPU."
-                    )
+        self._validate_gpu_params()


 def _validate_and_convert_feature_col_as_float_col_list(
@ -584,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            arbitrary_params_dict={},
        )

+        self.logger = get_logger(self.__class__.__name__)
+
    def setParams(self, **kwargs: Any) -> None:  # pylint: disable=invalid-name
        """
        Set params for the estimator.
@ -886,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):

        return booster_params, train_call_kwargs_params, dmatrix_kwargs

+    def _skip_stage_level_scheduling(self) -> bool:
+        # pylint: disable=too-many-return-statements
+        """Check if stage-level scheduling is not needed,
+        return true to skip stage-level scheduling"""
+
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+            ss = _get_spark_session()
+            sc = ss.sparkContext
+
+            if ss.version < "3.4.0":
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark version 3.4.0+"
+                )
+                return True
+
+            if not _is_standalone_or_localcluster(sc):
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark standalone or "
+                    "local-cluster mode"
+                )
+                return True
+
+            executor_cores = sc.getConf().get("spark.executor.cores")
+            executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
+            if executor_cores is None or executor_gpus is None:
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark.executor.cores, "
+                    "spark.executor.resource.gpu.amount to be set."
+                )
+                return True
+
+            if int(executor_cores) == 1:
+                # there will be only 1 task running at any time.
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
+                )
+                return True
+
+            if int(executor_gpus) > 1:
+                # For spark.executor.resource.gpu.amount > 1, we suppose user knows how to configure
+                # to make xgboost run successfully.
+                #
+                self.logger.info(
+                    "Stage-level scheduling in xgboost will not work "
+                    "when spark.executor.resource.gpu.amount>1"
+                )
+                return True
+
+            task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
+
+            if task_gpu_amount is None:
+                # The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set,
+                # but with stage-level scheduling, we can make training task grab the gpu.
+                return False
+
+            if float(task_gpu_amount) == float(executor_gpus):
+                # spark.executor.resource.gpu.amount=spark.task.resource.gpu.amount "
+                # results in only 1 task running at a time, which may cause perf issue.
+                return True
+
+            # We can enable stage-level scheduling
+            return False
+
+        # CPU training doesn't require stage-level scheduling
+        return True
+
+    def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
+        """Try to enable stage-level scheduling"""
+
+        if self._skip_stage_level_scheduling():
+            return rdd
+
+        ss = _get_spark_session()
+
+        # executor_cores will not be None
+        executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
+        assert executor_cores is not None
+
+        # Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
+        # If spark-rapids is enabled, to avoid GPU OOM, we don't allow other
+        # ETL gpu tasks running alongside training tasks.
+        spark_plugins = ss.conf.get("spark.plugins", " ")
+        assert spark_plugins is not None
+        spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true")
+        assert spark_rapids_sql_enabled is not None
+
+        task_cores = (
+            int(executor_cores)
+            if "com.nvidia.spark.SQLPlugin" in spark_plugins
+            and "true" == spark_rapids_sql_enabled.lower()
+            else (int(executor_cores) // 2) + 1
+        )
+
+        # Each training task requires cpu cores > total executor cores//2 + 1 which can
+        # make sure the tasks be sent to different executors.
+        #
+        # Please note that we can't use GPU to limit the concurrent tasks because of
+        # https://issues.apache.org/jira/browse/SPARK-45527.
+
+        task_gpus = 1.0
+        treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
+        rp = ResourceProfileBuilder().require(treqs).build
+
+        self.logger.info(
+            "XGBoost training tasks require the resource(cores=%s, gpu=%s).",
+            task_cores,
+            task_gpus,
+        )
+        return rdd.withResources(rp)
+
    def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
        # pylint: disable=too-many-statements, too-many-locals
        self._validate_params()
@ -986,14 +1110,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                )

        def _run_job() -> Tuple[str, str]:
-            ret = (
+            rdd = (
                dataset.mapInPandas(
-                    _train_booster, schema="config string, booster string"  # type: ignore
+                    _train_booster,  # type: ignore
+                    schema="config string, booster string",
                )
                .rdd.barrier()
                .mapPartitions(lambda x: x)
-                .collect()[0]
            )
+            rdd_with_resource = self._try_stage_level_scheduling(rdd)
+            ret = rdd_with_resource.collect()[0]
            return ret[0], ret[1]

        get_logger("XGBoost-PySpark").info(
@ -1117,12 +1243,111 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
            )
        return features_col, feature_col_names

+    def _get_pred_contrib_col_name(self) -> Optional[str]:
+        """Return the pred_contrib_col col name"""
+        pred_contrib_col_name = None
+        if (
+            self.isDefined(self.pred_contrib_col)
+            and self.getOrDefault(self.pred_contrib_col) != ""
+        ):
+            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+
+        return pred_contrib_col_name
+
+    def _out_schema(self) -> Tuple[bool, str]:
+        """Return the bool to indicate if it's a single prediction, true is single prediction,
+        and the returned type of the user-defined function. The value must
+        be a DDL-formatted type string."""
+
+        if self._get_pred_contrib_col_name() is not None:
+            return False, f"{pred.prediction} double, {pred.pred_contrib} array<double>"
+
+        return True, "double"
+
+    def _get_predict_func(self) -> Callable:
+        """Return the true prediction function which will be running on the executor side"""
+
+        predict_params = self._gen_predict_params_dict()
+        pred_contrib_col_name = self._get_pred_contrib_col_name()
+
+        def _predict(
+            model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike]
+        ) -> Union[pd.DataFrame, pd.Series]:
+            data = {}
+            preds = model.predict(
+                X,
+                base_margin=base_margin,
+                validate_features=False,
+                **predict_params,
+            )
+            data[pred.prediction] = pd.Series(preds)
+
+            if pred_contrib_col_name is not None:
+                contribs = pred_contribs(model, X, base_margin)
+                data[pred.pred_contrib] = pd.Series(list(contribs))
+                return pd.DataFrame(data=data)
+
+            return data[pred.prediction]
+
+        return _predict
+
+    def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
+        """Post process of transform"""
+        prediction_col_name = self.getOrDefault(self.predictionCol)
+        single_pred, _ = self._out_schema()
+
+        if single_pred:
+            if prediction_col_name:
+                dataset = dataset.withColumn(prediction_col_name, pred_col)
+        else:
+            pred_struct_col = "_prediction_struct"
+            dataset = dataset.withColumn(pred_struct_col, pred_col)
+
+            if prediction_col_name:
+                dataset = dataset.withColumn(
+                    prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
+                )
+
+            pred_contrib_col_name = self._get_pred_contrib_col_name()
+            if pred_contrib_col_name is not None:
+                dataset = dataset.withColumn(
+                    pred_contrib_col_name,
+                    array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
+                )
+
+            dataset = dataset.drop(pred_struct_col)
+        return dataset
+
+    def _gpu_transform(self) -> bool:
+        """If gpu is used to do the prediction, true to gpu prediction"""
+
+        if _is_local(_get_spark_session().sparkContext):
+            # if it's local model, we just use the internal "device"
+            return use_cuda(self.getOrDefault(self.device))
+
+        gpu_per_task = (
+            _get_spark_session()
+            .sparkContext.getConf()
+            .get("spark.task.resource.gpu.amount")
+        )
+
+        # User don't set gpu configurations, just use cpu
+        if gpu_per_task is None:
+            if use_cuda(self.getOrDefault(self.device)):
+                get_logger("XGBoost-PySpark").warning(
+                    "Do the prediction on the CPUs since "
+                    "no gpu configurations are set"
+                )
+            return False
+
+        # User already sets the gpu configurations, we just use the internal "device".
+        return use_cuda(self.getOrDefault(self.device))
+
    def _transform(self, dataset: DataFrame) -> DataFrame:
        # pylint: disable=too-many-statements, too-many-locals
        # Save xgb_sklearn_model and predict_params to be local variable
        # to avoid the `self` object to be pickled to remote.
        xgb_sklearn_model = self._xgb_sklearn_model
-        predict_params = self._gen_predict_params_dict()

        has_base_margin = False
        if (
@ -1137,79 +1362,92 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
        features_col, feature_col_names = self._get_feature_col(dataset)
        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)

-        pred_contrib_col_name = None
-        if (
-            self.isDefined(self.pred_contrib_col)
-            and self.getOrDefault(self.pred_contrib_col) != ""
-        ):
-            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+        predict_func = self._get_predict_func()

-        single_pred = True
-        schema = "double"
-        if pred_contrib_col_name:
-            single_pred = False
-            schema = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
+        _, schema = self._out_schema()
+
+        is_local = _is_local(_get_spark_session().sparkContext)
+        run_on_gpu = self._gpu_transform()

        @pandas_udf(schema)  # type: ignore
        def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
            assert xgb_sklearn_model is not None
            model = xgb_sklearn_model
+
+            from pyspark import TaskContext
+
+            context = TaskContext.get()
+            assert context is not None
+
+            dev_ordinal = -1
+
+            if is_cudf_available():
+                if is_local:
+                    if run_on_gpu and is_cupy_available():
+                        import cupy as cp  # pylint: disable=import-error
+
+                        total_gpus = cp.cuda.runtime.getDeviceCount()
+                        if total_gpus > 0:
+                            partition_id = context.partitionId()
+                            # For transform local mode, default the dev_ordinal to
+                            # (partition id) % gpus.
+                            dev_ordinal = partition_id % total_gpus
+                elif run_on_gpu:
+                    dev_ordinal = _get_gpu_id(context)
+
+                if dev_ordinal >= 0:
+                    device = "cuda:" + str(dev_ordinal)
+                    get_logger("XGBoost-PySpark").info(
+                        "Do the inference with device: %s", device
+                    )
+                    model.set_params(device=device)
+                else:
+                    get_logger("XGBoost-PySpark").info("Do the inference on the CPUs")
+            else:
+                msg = (
+                    "CUDF is unavailable, fallback the inference on the CPUs"
+                    if run_on_gpu
+                    else "Do the inference on the CPUs"
+                )
+                get_logger("XGBoost-PySpark").info(msg)
+
+            def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
+                """Move the data to gpu if possible"""
+                if dev_ordinal >= 0:
+                    import cudf  # pylint: disable=import-error
+                    import cupy as cp  # pylint: disable=import-error
+
+                    # We must set the device after import cudf, which will change the device id to 0
+                    # See https://github.com/rapidsai/cudf/issues/11386
+                    cp.cuda.runtime.setDevice(dev_ordinal)  # pylint: disable=I1101
+                    df = cudf.DataFrame(data)
+                    del data
+                    return df
+                return data
+
            for data in iterator:
                if enable_sparse_data_optim:
                    X = _read_csr_matrix_from_unwrapped_spark_vec(data)
                else:
                    if feature_col_names is not None:
-                        X = data[feature_col_names]
+                        tmp = data[feature_col_names]
                    else:
-                        X = stack_series(data[alias.data])
+                        tmp = stack_series(data[alias.data])
+                    X = to_gpu_if_possible(tmp)

                if has_base_margin:
-                    base_margin = data[alias.margin].to_numpy()
+                    base_margin = to_gpu_if_possible(data[alias.margin])
                else:
                    base_margin = None

-                data = {}
-                preds = model.predict(
-                    X,
-                    base_margin=base_margin,
-                    validate_features=False,
-                    **predict_params,
-                )
-                data[pred.prediction] = pd.Series(preds)
-
-                if pred_contrib_col_name:
-                    contribs = pred_contribs(model, X, base_margin)
-                    data[pred.pred_contrib] = pd.Series(list(contribs))
-                    yield pd.DataFrame(data=data)
-                else:
-                    yield data[pred.prediction]
+                yield predict_func(model, X, base_margin)

        if has_base_margin:
            pred_col = predict_udf(struct(*features_col, base_margin_col))
        else:
            pred_col = predict_udf(struct(*features_col))

-        prediction_col_name = self.getOrDefault(self.predictionCol)
-
-        if single_pred:
-            dataset = dataset.withColumn(prediction_col_name, pred_col)
-        else:
-            pred_struct_col = "_prediction_struct"
-            dataset = dataset.withColumn(pred_struct_col, pred_col)
-
-            dataset = dataset.withColumn(
-                prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
-            )
-
-            if pred_contrib_col_name:
-                dataset = dataset.withColumn(
-                    pred_contrib_col_name,
-                    array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
-                )
-
-            dataset = dataset.drop(pred_struct_col)
-
-        return dataset
+        return self._post_transform(dataset, pred_col)


 class _ClassificationModel(  # pylint: disable=abstract-method
@ -1221,22 +1459,21 @@ class _ClassificationModel(  # pylint: disable=abstract-method
    .. Note:: This API is experimental.
    """

-    def _transform(self, dataset: DataFrame) -> DataFrame:
-        # pylint: disable=too-many-statements, too-many-locals
-        # Save xgb_sklearn_model and predict_params to be local variable
-        # to avoid the `self` object to be pickled to remote.
-        xgb_sklearn_model = self._xgb_sklearn_model
-        predict_params = self._gen_predict_params_dict()
-
-        has_base_margin = False
-        if (
-            self.isDefined(self.base_margin_col)
-            and self.getOrDefault(self.base_margin_col) != ""
-        ):
-            has_base_margin = True
-            base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias(
-                alias.margin
+    def _out_schema(self) -> Tuple[bool, str]:
+        schema = (
+            f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
+            f" {pred.probability} array<double>"
        )
+        if self._get_pred_contrib_col_name() is not None:
+            # We will force setting strict_shape to True when predicting contribs,
+            # So, it will also output 3-D shape result.
+            schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
+
+        return False, schema
+
+    def _get_predict_func(self) -> Callable:
+        predict_params = self._gen_predict_params_dict()
+        pred_contrib_col_name = self._get_pred_contrib_col_name()

        def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
            if margins.ndim == 1:
@ -1251,45 +1488,9 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                class_probs = softmax(raw_preds, axis=1)
            return raw_preds, class_probs

-        features_col, feature_col_names = self._get_feature_col(dataset)
-        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
-
-        pred_contrib_col_name = None
-        if (
-            self.isDefined(self.pred_contrib_col)
-            and self.getOrDefault(self.pred_contrib_col) != ""
-        ):
-            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
-
-        schema = (
-            f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
-            f" {pred.probability} array<double>"
-        )
-        if pred_contrib_col_name:
-            # We will force setting strict_shape to True when predicting contribs,
-            # So, it will also output 3-D shape result.
-            schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
-
-        @pandas_udf(schema)  # type: ignore
-        def predict_udf(
-            iterator: Iterator[Tuple[pd.Series, ...]]
-        ) -> Iterator[pd.DataFrame]:
-            assert xgb_sklearn_model is not None
-            model = xgb_sklearn_model
-            for data in iterator:
-                if enable_sparse_data_optim:
-                    X = _read_csr_matrix_from_unwrapped_spark_vec(data)
-                else:
-                    if feature_col_names is not None:
-                        X = data[feature_col_names]  # type: ignore
-                    else:
-                        X = stack_series(data[alias.data])
-
-                if has_base_margin:
-                    base_margin = stack_series(data[alias.margin])
-                else:
-                    base_margin = None
-
+        def _predict(
+            model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray]
+        ) -> Union[pd.DataFrame, pd.Series]:
            margins = model.predict(
                X,
                base_margin=base_margin,
@ -1308,19 +1509,17 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                pred.probability: pd.Series(list(class_probs)),
            }

-                if pred_contrib_col_name:
+            if pred_contrib_col_name is not None:
                contribs = pred_contribs(model, X, base_margin, strict_shape=True)
                result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))

-                yield pd.DataFrame(data=result)
+            return pd.DataFrame(data=result)

-        if has_base_margin:
-            pred_struct = predict_udf(struct(*features_col, base_margin_col))
-        else:
-            pred_struct = predict_udf(struct(*features_col))
+        return _predict

+    def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
        pred_struct_col = "_prediction_struct"
-        dataset = dataset.withColumn(pred_struct_col, pred_struct)
+        dataset = dataset.withColumn(pred_struct_col, pred_col)

        raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol)
        if raw_prediction_col_name:
@ -1342,7 +1541,8 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                array_to_vector(getattr(col(pred_struct_col), pred.probability)),
            )

-        if pred_contrib_col_name:
+        pred_contrib_col_name = self._get_pred_contrib_col_name()
+        if pred_contrib_col_name is not None:
            dataset = dataset.withColumn(
                pred_contrib_col_name,
                getattr(col(pred_struct_col), pred.pred_contrib),
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@ -10,7 +10,7 @@ from threading import Thread
 from typing import Any, Callable, Dict, Optional, Set, Type

 import pyspark
-from pyspark import BarrierTaskContext, SparkContext, SparkFiles
+from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext
 from pyspark.sql.session import SparkSession

 from xgboost import Booster, XGBModel, collective
@ -129,7 +129,14 @@ def _is_local(spark_context: SparkContext) -> bool:
    return spark_context._jsc.sc().isLocal()


-def _get_gpu_id(task_context: BarrierTaskContext) -> int:
+def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
+    master = spark_context.getConf().get("spark.master")
+    return master is not None and (
+        master.startswith("spark://") or master.startswith("local-cluster")
+    )
+
+
+def _get_gpu_id(task_context: TaskContext) -> int:
    """Get the gpu id from the task resources"""
    if task_context is None:
        # This is a safety check.
--- a/python-package/xgboost/testing/ranking.py
+++ b/python-package/xgboost/testing/ranking.py
@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:

    with pytest.raises(ValueError, match="Either `group` or `qid`."):
        ranker.fit(df, y, eval_set=[(X, y)])
+
+
+def run_ranking_categorical(device: str) -> None:
+    """Test LTR with categorical features."""
+    from sklearn.model_selection import cross_val_score
+
+    X, y = tm.make_categorical(
+        n_samples=512, n_features=10, n_categories=3, onehot=False
+    )
+    rng = np.random.default_rng(1994)
+    qid = rng.choice(3, size=y.shape[0])
+    qid = np.sort(qid)
+    X["qid"] = qid
+
+    ltr = xgb.XGBRanker(enable_categorical=True, device=device)
+    ltr.fit(X, y)
+    score = ltr.score(X, y)
+    assert score > 0.9
+
+    ltr = xgb.XGBRanker(enable_categorical=True, device=device)
+
+    # test using the score function inside sklearn.
+    scores = cross_val_score(ltr, X, y)
+    for s in scores:
+        assert s > 0.7
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 2fea6734e83cf147c1bbe580ac4713cd50abcad5
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -16,6 +16,11 @@ if (USE_CUDA)
  target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
 endif (USE_CUDA)

+if (USE_HIP)
+  file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h)
+  target_sources(objxgboost PRIVATE ${HIP_SOURCES})
+endif (USE_HIP)
+
 target_include_directories(objxgboost
  PRIVATE
  ${xgboost_SOURCE_DIR}/include
@ -33,6 +38,7 @@ msvc_use_static_runtime()

 # This grouping organises source files nicely in visual studio
 auto_source_group("${CUDA_SOURCES}")
+auto_source_group("${HIP_SOURCES}")
 auto_source_group("${CPU_SOURCES}")

 #-- End object library
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@ -70,12 +70,14 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {

 using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 namespace xgboost {
 void XGBBuildInfoDevice(Json *p_info) {
  auto &info = *p_info;
  info["USE_CUDA"] = Boolean{false};
  info["USE_NCCL"] = Boolean{false};
+  info["USE_HIP"] = Boolean{false};
+  info["USE_RCCL"] = Boolean{false};
  info["USE_RMM"] = Boolean{false};
 }
 }  // namespace xgboost
@ -279,7 +281,7 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
  API_END();
 }

-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *, char const *, DMatrixHandle *) {
  API_BEGIN();
  common::AssertGPUSupport();
@ -1189,7 +1191,7 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
  API_END();
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 XGB_DLL int XGBoosterPredictFromCUDAArray(BoosterHandle handle, char const *, char const *,
                                          DMatrixHandle, xgboost::bst_ulong const **,
                                          xgboost::bst_ulong *, const float **) {
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@ -17,7 +17,11 @@ namespace xgboost {
 void XGBBuildInfoDevice(Json *p_info) {
  auto &info = *p_info;

+#if defined(XGBOOST_USE_CUDA)
  info["USE_CUDA"] = true;
+#elif defined(XGBOOST_USE_HIP)
+  info["USE_HIP"] = true;
+#endif

  std::vector<Json> v{Json{Integer{THRUST_MAJOR_VERSION}}, Json{Integer{THRUST_MINOR_VERSION}},
                      Json{Integer{THRUST_SUBMINOR_VERSION}}};
@ -30,8 +34,13 @@ void XGBBuildInfoDevice(Json *p_info) {
  info["USE_NCCL"] = Boolean{true};
  v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
  info["NCCL_VERSION"] = v;
+#elif defined(XGBOOST_USE_RCCL)
+  info["USE_RCCL"] = Boolean{true};
+  v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
+  info["RCCL_VERSION"] = v;
 #else
  info["USE_NCCL"] = Boolean{false};
+  info["USE_RCCL"] = Boolean{false};
 #endif

 #if defined(XGBOOST_USE_RMM)
--- a/src/c_api/c_api.hip
+++ b/src/c_api/c_api.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "c_api.cu"
+#endif
--- a/src/c_api/c_api_utils.h
+++ b/src/c_api/c_api_utils.h
@ -175,7 +175,7 @@ inline float GetMissing(Json const &config) {

 // Safe guard some global variables from being changed by XGBoost.
 class XGBoostAPIGuard {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  int32_t device_id_ {0};

  void SetGPUAttribute();
--- a/src/collective/communicator.cc
+++ b/src/collective/communicator.cc
@ -51,7 +51,7 @@ void Communicator::Init(Json const& config) {
  }
 }

-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void Communicator::Finalize() {
  communicator_->Shutdown();
  communicator_.reset(new NoOpCommunicator());
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@ -5,7 +5,7 @@
 #include "device_communicator.cuh"
 #include "device_communicator_adapter.cuh"
 #include "noop_communicator.h"
-#ifdef XGBOOST_USE_NCCL
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include "nccl_device_communicator.cuh"
 #endif

@ -28,7 +28,7 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
      communicator_->GetWorldSize() != old_world_size) {
    old_device_ordinal = device_ordinal;
    old_world_size = communicator_->GetWorldSize();
-#ifdef XGBOOST_USE_NCCL
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
    switch (type_) {
      case CommunicatorType::kRabit:
        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@ -98,7 +98,7 @@ class Communicator {
  /** @brief Get the communicator instance. */
  static Communicator *Get() { return communicator_.get(); }

-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  /**
   * @brief Get the device communicator.
   *
@ -230,7 +230,7 @@ class Communicator {

  static thread_local std::unique_ptr<Communicator> communicator_;
  static thread_local CommunicatorType type_;
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
  static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif

--- a/src/collective/communicator.hip
+++ b/src/collective/communicator.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "communicator.cu"
+#endif
--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@ -41,7 +41,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    }

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-
    segments->clear();
    segments->resize(world_size_, 0);
    segments->at(rank_) = length_bytes;
@ -60,6 +59,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      Broadcast(host_buffer_.data() + offset, as_bytes, i);
      offset += as_bytes;
    }
+
    dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
                             cudaMemcpyDefault));
  }
--- a/src/collective/nccl_device_communicator.cu
+++ b/src/collective/nccl_device_communicator.cu
@ -1,7 +1,7 @@
 /*!
 * Copyright 2023 XGBoost contributors
 */
-#if defined(XGBOOST_USE_NCCL)
+#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
 #include "nccl_device_communicator.cuh"

 namespace xgboost {
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@ -35,12 +35,22 @@ class NcclDeviceCommunicator : public DeviceCommunicator {

 private:
  static constexpr std::size_t kUuidLength =
+#if defined(XGBOOST_USE_CUDA)
      sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
+#elif defined(XGBOOST_USE_HIP)
+      sizeof(hipUUID) / sizeof(uint64_t);
+#endif

  void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
+#if defined(XGBOOST_USE_CUDA)
    cudaDeviceProp prob{};
    dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
    std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+#elif defined(XGBOOST_USE_HIP)
+    hipUUID id;
+    hipDeviceGetUuid(&id, device_ordinal_);
+    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
+#endif
  }

  static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {
--- a/src/collective/nccl_device_communicator.hip
+++ b/src/collective/nccl_device_communicator.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "nccl_device_communicator.cu"
+#endif
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@ -10,7 +10,13 @@

 #include <cstddef>             // size_t
 #include <cstdint>             // int32_t
+
+#if defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#elif defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>         // DispatchSegmentedRadixSort,NullType,DoubleBuffer
+#endif
+
 #include <iterator>            // distance
 #include <limits>              // numeric_limits
 #include <type_traits>         // conditional_t,remove_const_t
@ -39,6 +45,7 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
  using OffsetT = int;

  // Null value type
+#if defined(XGBOOST_USE_CUDA)
  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
  cub::DoubleBuffer<cub::NullType> d_values;

@ -47,6 +54,20 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
                                    num_segments, d_begin_offsets, d_end_offsets, begin_bit,
                                    end_bit, false, ctx->Stream(), debug_synchronous)));
+#elif defined(XGBOOST_USE_HIP)
+  if (IS_DESCENDING) {
+      rocprim::segmented_radix_sort_pairs_desc<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
+              temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
+              num_segments, d_begin_offsets, d_end_offsets,
+              begin_bit, end_bit, ctx->Stream(), debug_synchronous);
+  }
+  else {
+      rocprim::segmented_radix_sort_pairs<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
+              temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
+              num_segments, d_begin_offsets, d_end_offsets,
+              begin_bit, end_bit, ctx->Stream(), debug_synchronous);
+  }
+#endif
 }

 // Wrapper around cub sort for easier `descending` sort.
@ -60,14 +81,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
                                  int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
+#if defined(XGBOOST_USE_CUDA)
  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
  cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
+#endif
+
  // In old version of cub, num_items in dispatch is also int32_t, no way to change.
  using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
                                     std::size_t, std::int32_t>;
  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
  // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation

+#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
  dh::safe_cuda((cub::DispatchSegmentedRadixSort<
                 descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
@ -88,6 +113,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                                          d_begin_offsets, d_end_offsets, begin_bit,
                                                          end_bit, false, stream, false)));
 #endif
+#elif defined(XGBOOST_USE_HIP)
+  if (descending) {
+      rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+              d_values_in, d_values_out, num_items, num_segments,
+              d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream, false);
+  }
+  else {
+      rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+              d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets,
+              begin_bit, end_bit, stream, false);
+  }
+#endif
 }
 }  // namespace detail

--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@ -17,14 +17,18 @@
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include "device_helpers.cuh"
-#endif  // defined(__CUDACC__)
+#elif defined(__HIPCC__)
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include "device_helpers.hip.h"
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)

 #include "xgboost/span.h"
 #include "common.h"

 namespace xgboost {

-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 using BitFieldAtomicType = unsigned long long;  // NOLINT

 __forceinline__ __device__ BitFieldAtomicType AtomicOr(BitFieldAtomicType* address,
@ -48,7 +52,7 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr

  return old;
 }
-#endif  // defined(__CUDACC__)
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)

 /**
 * @brief A non-owning type with auxiliary methods defined for manipulating bits.
@ -106,7 +110,7 @@ struct BitFieldContainer {
  XGBOOST_DEVICE static size_t ComputeStorageSize(index_type size) {
    return common::DivRoundUp(size, kValueSize);
  }
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t min_size = min(NumValues(), rhs.NumValues());
@ -123,9 +127,9 @@ struct BitFieldContainer {
    }
    return *this;
  }
-#endif  // #if defined(__CUDA_ARCH__)
+#endif  // #if defined(__CUDA_ARCH__) || defined(__HIPCC__)

-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  __device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
    size_t min_size = min(NumValues(), rhs.NumValues());
    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@ -144,7 +148,7 @@ struct BitFieldContainer {
  }
 #endif  // defined(__CUDA_ARCH__)

-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  __device__ auto Set(index_type pos) noexcept(true) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
    value_type& value = Data()[pos_v.int_pos];
@ -159,6 +163,14 @@ struct BitFieldContainer {
    using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
    atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
  }
+#ifdef __HIPCC__
+  void Clear(index_type pos) noexcept(true) {
+    Pos pos_v = Direction::Shift(ToBitPos(pos));
+    value_type& value = Data()[pos_v.int_pos];
+    value_type clear_bit = ~(kOne << pos_v.bit_pos);
+    value &= clear_bit;
+  }
+#endif
 #else
  void Set(index_type pos) noexcept(true) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
@ -172,7 +184,7 @@ struct BitFieldContainer {
    value_type clear_bit = ~(kOne << pos_v.bit_pos);
    value &= clear_bit;
  }
-#endif  // defined(__CUDA_ARCH__)
+#endif  // defined(__CUDA_ARCH__) || defined(__HIPCC__)

  XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
    pos_v = Direction::Shift(pos_v);
--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@ -52,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
 *
 *   Go to left if it's NOT the matching category, which matches one-hot encoding.
 */
-inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
+inline XGBOOST_DEVICE bool Decision(common::Span<CatBitField::value_type const> cats, float cat) {
  KCatBitField const s_cats(cats);
  if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
    return true;
--- a/src/common/common.cc
+++ b/src/common/common.cc
@ -55,7 +55,7 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
  }
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 int AllVisibleGPUs() { return 0; }
 #endif  // !defined(XGBOOST_USE_CUDA)

--- a/src/common/common.cu
+++ b/src/common/common.cu
@ -2,6 +2,7 @@
 * Copyright 2018-2022 XGBoost contributors
 */
 #include "common.h"
+#include "cuda_to_hip.h"

 namespace xgboost {
 namespace common {
--- a/src/common/common.h
+++ b/src/common/common.h
@ -26,6 +26,12 @@

 #define WITH_CUDA() true

+#elif defined(__HIPCC__)
+#include <thrust/system/hip/error.h>
+#include <thrust/system_error.h>
+
+#define WITH_CUDA() true
+
 #else

 #define WITH_CUDA() false
@ -39,8 +45,8 @@ namespace dh {
 */
 #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)

-inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                    int line) {
+inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, int line)
+{
  if (code != cudaSuccess) {
    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
                                       std::string{file} + ": " +  // NOLINT
@ -48,7 +54,23 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
  }
  return code;
 }
-#endif  // defined(__CUDACC__)
+
+#elif defined(__HIPCC__)
+/*
+ * Error handling  functions
+ */
+#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
+
+inline hipError_t ThrowOnCudaError(hipError_t code, const char *file, int line)
+{
+  if (code != hipSuccess) {
+    LOG(FATAL) << thrust::system_error(code, thrust::hip_category(),
+                                       std::string{file} + ": " +  // NOLINT
+                                       std::to_string(line)).what();
+  }
+  return code;
+}
+#endif
 }  // namespace dh

 namespace xgboost::common {
@ -159,9 +181,9 @@ class Range {
 int AllVisibleGPUs();

 inline void AssertGPUSupport() {
-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
    LOG(FATAL) << "XGBoost version not compiled with GPU support.";
-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA && XGBOOST_USE_HIP
 }

 inline void AssertOneAPISupport() {
@ -172,7 +194,7 @@ inline void AssertOneAPISupport() {

 void SetDevice(std::int32_t device);

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline void SetDevice(std::int32_t device) {
  if (device >= 0) {
    AssertGPUSupport();
--- a/src/common/common.hip
+++ b/src/common/common.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "common.cu"
+#endif
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@ -11,9 +11,11 @@

 #include "common.h"

-#ifdef __CUDACC__
+#if defined(__CUDACC__)
 #include "device_helpers.cuh"
-#endif  // __CUDACC__
+#elif defined(__HIPCC__)
+#include "device_helpers.hip.h"
+#endif  // __CUDACC__ || __HIPCC__

 namespace xgboost {
 namespace common {
@ -105,7 +107,7 @@ class CompressedBufferWriter {
    }
  }

-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
  __device__ void AtomicWriteSymbol
    (CompressedByteT* buffer, uint64_t symbol, size_t offset) {
    size_t ibit_start = offset * symbol_bits_;
@ -119,7 +121,7 @@ class CompressedBufferWriter {
      symbol >>= 8;
    }
  }
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__

  template <typename IterT>
  void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
--- a/src/common/cuda_context.cuh
+++ b/src/common/cuda_context.cuh
@ -4,7 +4,6 @@
 #ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
 #define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
 #include <thrust/execution_policy.h>
-
 #include "device_helpers.cuh"

 namespace xgboost {
--- a/src/common/cuda_pinned_allocator.h
+++ b/src/common/cuda_pinned_allocator.h
@ -72,11 +72,23 @@ class pinned_allocator {
    if (cnt > this->max_size()) { throw std::bad_alloc(); }  // end if

    pointer result(nullptr);
+
+#if defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+#else
    dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+#endif
+
    return result;
  }

-  inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT
+  inline void deallocate(pointer p, size_type) {
+#if defined(XGBOOST_USE_HIP)
+      dh::safe_cuda(hipHostFree(p));
+#else
+      dh::safe_cuda(cudaFreeHost(p));
+#endif
+  } // NOLINT

  inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // NOLINT

--- a/src/common/cuda_to_hip.h
+++ b/src/common/cuda_to_hip.h
@ -0,0 +1,71 @@
+/**
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#pragma once
+
+#if defined(XGBOOST_USE_HIP)
+
+#define cudaSuccess                  hipSuccess
+#define cudaGetLastError             hipGetLastError
+
+#define cudaStream_t                 hipStream_t
+#define cudaStreamCreate             hipStreamCreate
+#define cudaStreamCreateWithFlags    hipStreamCreateWithFlags
+#define cudaStreamDestroy            hipStreamDestroy
+#define cudaStreamWaitEvent          hipStreamWaitEvent
+#define cudaStreamSynchronize        hipStreamSynchronize
+#define cudaStreamPerThread          hipStreamPerThread
+#define cudaStreamLegacy             hipStreamLegacy
+
+#define cudaEvent_t                  hipEvent_t
+#define cudaEventCreate              hipEventCreate
+#define cudaEventCreateWithFlags     hipEventCreateWithFlags
+#define cudaEventDestroy             hipEventDestroy
+
+#define cudaGetDevice                hipGetDevice
+#define cudaSetDevice                hipSetDevice
+#define cudaGetDeviceCount           hipGetDeviceCount
+#define cudaDeviceSynchronize        hipDeviceSynchronize
+
+#define cudaGetDeviceProperties      hipGetDeviceProperties
+#define cudaDeviceGetAttribute       hipDeviceGetAttribute
+
+#define cudaMallocHost               hipMallocHost
+#define cudaFreeHost                 hipFreeHost
+#define cudaMalloc                   hipMalloc
+#define cudaFree                     hipFree
+
+#define cudaMemcpy                   hipMemcpy
+#define cudaMemcpyAsync              hipMemcpyAsync
+#define cudaMemcpyDefault            hipMemcpyDefault
+#define cudaMemcpyHostToDevice       hipMemcpyHostToDevice
+#define cudaMemcpyHostToHost         hipMemcpyHostToHost
+#define cudaMemcpyDeviceToHost       hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice     hipMemcpyDeviceToDevice
+#define cudaMemsetAsync              hipMemsetAsync
+#define cudaMemset                   hipMemset
+
+#define cudaPointerAttributes        hipPointerAttribute_t 
+#define cudaPointerGetAttributes     hipPointerGetAttributes
+
+#define cudaMemGetInfo               hipMemGetInfo
+#define cudaFuncSetAttribute         hipFuncSetAttribute
+
+#define cudaDevAttrMultiProcessorCount                hipDeviceAttributeMultiprocessorCount
+#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
+
+namespace thrust {
+    namespace hip {
+    }
+}
+
+namespace thrust {
+    namespace cuda = thrust::hip;
+}
+
+namespace hipcub {
+}
+
+namespace cub = hipcub;
+
+#endif
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@ -2,6 +2,8 @@
 * Copyright 2017-2023 XGBoost contributors
 */
 #pragma once
+
+#if defined(XGBOOST_USE_CUDA)
 #include <thrust/binary_search.h>  // thrust::upper_bound
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_ptr.h>
@ -1218,3 +1220,7 @@ class LDGIterator {
  }
 };
 }  // namespace dh
+
+#elif defined(XGBOOST_USE_HIP)
+#include "device_helpers.hip.h"
+#endif
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
--- a/src/common/error_msg.cc
+++ b/src/common/error_msg.cc
@ -3,9 +3,11 @@
 */
 #include "error_msg.h"

+#include <mutex>    // for call_once, once_flag
 #include <sstream>  // for stringstream

 #include "../collective/communicator-inl.h"  // for GetRank
+#include "xgboost/context.h"                 // for Context
 #include "xgboost/logging.h"

 namespace xgboost::error {
@ -26,34 +28,43 @@ void WarnDeprecatedGPUHist() {
 }

 void WarnManualUpdater() {
-  bool static thread_local logged{false};
-  if (logged) {
-    return;
-  }
+  static std::once_flag flag;
+  std::call_once(flag, [] {
    LOG(WARNING)
        << "You have manually specified the `updater` parameter. The `tree_method` parameter "
           "will be ignored. Incorrect sequence of updaters will produce undefined "
           "behavior. For common uses, we recommend using `tree_method` parameter instead.";
-  logged = true;
+  });
 }

 void WarnDeprecatedGPUId() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
+  static std::once_flag flag;
+  std::call_once(flag, [] {
    auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
    msg += " E.g. device=cpu/cuda/cuda:0";
    LOG(WARNING) << msg;
-  logged = true;
+  });
 }

 void WarnEmptyDataset() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag,
+                 [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); });
+}
+
+void MismatchedDevices(Context const* booster, Context const* data) {
+  static std::once_flag flag;
+  std::call_once(flag, [&] {
+    LOG(WARNING)
+        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
+           "lead to higher memory usage and slower performance. XGBoost is running on: "
+        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
+        << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict.
+
+This warning will only be shown once.
+)";
+  });
 }
 }  // namespace xgboost::error
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@ -11,6 +11,7 @@
 #include <string>     // for string

 #include "xgboost/base.h"     // for bst_feature_t
+#include "xgboost/context.h"  // for Context
 #include "xgboost/logging.h"
 #include "xgboost/string_view.h"  // for StringView

@ -94,5 +95,7 @@ constexpr StringView InvalidCUDAOrdinal() {
  return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
         "available for using GPU.";
 }
+
+void MismatchedDevices(Context const* booster, Context const* data);
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@ -84,10 +84,19 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
 template <std::uint32_t kBlockThreads, typename Kernel>
 std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
  int n_mps = 0;
+#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
+#endif
  int n_blocks_per_mp = 0;
+#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                              kBlockThreads, shared_mem));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
+                                                              kBlockThreads, shared_mem));
+#endif
  std::uint32_t grid_size = n_blocks_per_mp * n_mps;
  return grid_size;
 }
@ -170,6 +179,7 @@ void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per
      column_sizes_scan->begin(), [=] __device__(size_t column_size) {
        return thrust::min(num_cuts_per_feature, column_size);
      });
+
  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
@ -294,6 +304,7 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                                 &column_sizes_scan,
                                 &sorted_entries);
  dh::XGBDeviceAllocator<char> alloc;
+
  thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
               sorted_entries.end(), detail::EntryCompareOp());

@ -353,11 +364,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
          bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
          return weights[group_idx];
        });
+
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
+
    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  } else {
    CHECK_EQ(batch.NumRows(), weights.size());
@ -366,11 +379,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
        [=]__device__(size_t idx) -> float {
          return weights[batch.GetElement(idx).row_idx];
        });
+
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
+
    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  }

--- a/src/common/hist_util.hip
+++ b/src/common/hist_util.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "hist_util.cu"
+#endif
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@ -1,7 +1,7 @@
 /**
 * Copyright 2017-2023 by XGBoost contributors
 */
-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 // dummy implementation of HostDeviceVector in case CUDA is not used

@ -199,4 +199,4 @@ template class HostDeviceVector<std::size_t>;

 }  // namespace xgboost

-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA && !defined(XGBOOST_USE_HIP)
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@ -139,6 +139,7 @@ class HostDeviceVectorImpl {
      auto ptr = other->ConstDevicePointer();
      SetDevice();
      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
+
      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                    ptr,
                                    other->Size() * sizeof(T),
@ -195,6 +196,7 @@ class HostDeviceVectorImpl {
    gpu_access_ = access;
    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
    SetDevice();
+
    dh::safe_cuda(cudaMemcpy(data_h_.data(),
                             data_d_->data().get(),
                             data_d_->size() * sizeof(T),
@ -211,6 +213,7 @@ class HostDeviceVectorImpl {
    // data is on the host
    LazyResizeDevice(data_h_.size());
    SetDevice();
+
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                  data_h_.data(),
                                  data_d_->size() * sizeof(T),
@ -239,6 +242,7 @@ class HostDeviceVectorImpl {
      LazyResizeDevice(Size());
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
+
      dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
                                    data_d_->size() * sizeof(T), cudaMemcpyDefault));
    }
@ -248,6 +252,7 @@ class HostDeviceVectorImpl {
    LazyResizeDevice(Size());
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
+
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
  }
--- a/src/common/host_device_vector.hip
+++ b/src/common/host_device_vector.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "host_device_vector.cu"
+#endif
--- a/src/common/io.h
+++ b/src/common/io.h
@ -384,7 +384,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
   * @param length    See the `length` parameter of `mmap` for details.
   */
  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
-      : AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
+      : AlignedResourceReadStream{std::shared_ptr<MmapResource>{  // NOLINT
+            new MmapResource{std::move(path), offset, length}}} {}
  ~PrivateMmapConstStream() noexcept(false) override;
 };

--- a/src/common/linalg_op.cuh
+++ b/src/common/linalg_op.cuh
@ -12,7 +12,8 @@
 namespace xgboost {
 namespace linalg {
 template <typename T, int32_t D, typename Fn>
-void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
+void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
+{
  dh::safe_cuda(cudaSetDevice(t.DeviceIdx()));
  static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
                "For function with return, use transform instead.");
@ -28,7 +29,8 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s
 }

 template <typename T, int32_t D, typename Fn>
-void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
+void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
+{
  if (t.Contiguous()) {
    auto ptr = t.Values().data();
    dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@ -42,7 +42,7 @@ void ElementWiseKernelHost(linalg::TensorView<T, D> t, int32_t n_threads, Fn&& f
  }
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 template <typename T, int32_t D, typename Fn>
 void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
  common::AssertGPUSupport();
@ -60,7 +60,7 @@ void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn)
  }
  ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 template <typename T, std::int32_t kDim>
 auto cbegin(TensorView<T, kDim> const& v) {  // NOLINT
--- a/src/common/math.h
+++ b/src/common/math.h
@ -143,7 +143,7 @@ CheckNAN(T) {
  return false;
 }

-#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
+#if XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__) && !defined(__HIPCC__)

 bool CheckNAN(double v);

@ -152,23 +152,27 @@ bool CheckNAN(double v);
 XGBOOST_DEVICE bool inline CheckNAN(float x) {
 #if defined(__CUDA_ARCH__)
  return isnan(x);
+#elif defined(__HIPCC__)
+  return __builtin_isnan(x);
 #else
  return std::isnan(x);
-#endif  // defined(__CUDA_ARCH__)
+#endif  // defined(__CUDA_ARCH__) || defined(__HIPCC__)
 }

 XGBOOST_DEVICE bool inline CheckNAN(double x) {
 #if defined(__CUDA_ARCH__)
  return isnan(x);
+#elif defined(__HIPCC__)
+  return __builtin_isnan(x);
 #else
  return std::isnan(x);
-#endif  // defined(__CUDA_ARCH__)
+#endif  // defined(__CUDA_ARCH__) || defined(__HIPCC__)
 }

 #endif  // XGBOOST_STRICT_R_MODE && !defined(__CUDA_ARCH__)
 // GPU version is not uploaded in CRAN anyway.
 // Specialize only when using R with CPU.
-#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA)
+#if XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 double LogGamma(double v);

 #else  // Not R or R with GPU.
@ -191,7 +195,7 @@ XGBOOST_DEVICE inline T LogGamma(T v) {
 #endif  // _MSC_VER
 }

-#endif  // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA)
+#endif  // XGBOOST_STRICT_R_MODE && !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 }  // namespace common
 }  // namespace xgboost
--- a/src/common/numeric.cu
+++ b/src/common/numeric.cu
@ -15,6 +15,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
  values.SetDevice(ctx->gpu_id);
  auto const d_values = values.ConstDeviceSpan();
  dh::XGBCachingDeviceAllocator<char> alloc;
+
  return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                    thrust::plus<float>{});
 }
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@ -99,12 +99,12 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {

 namespace cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline double Reduce(Context const*, HostDeviceVector<float> const&) {
  AssertGPUSupport();
  return 0;
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace cuda_impl

 /**
--- a/src/common/numeric.hip
+++ b/src/common/numeric.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "numeric.cu"
+#endif
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@ -109,6 +109,7 @@ template <typename T, typename U>
 void CopyTo(Span<T> out, Span<U> src) {
  CHECK_EQ(out.size(), src.size());
  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
+
  dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                out.size_bytes(),
                                cudaMemcpyDefault));
@ -162,6 +163,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
  // Compute output ptr
  auto transform_it =
      thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
+
  thrust::transform(
      thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
      out_ptr.data(),
@ -211,6 +213,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
               Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
               Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
  dh::safe_cuda(cudaSetDevice(device));
+
  CHECK_EQ(d_x.size() + d_y.size(), out.size());
  CHECK_EQ(x_ptr.size(), out_ptr.size());
  CHECK_EQ(y_ptr.size(), out_ptr.size());
@ -309,6 +312,7 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
                           common::Span<OffsetT> cuts_ptr,
                           size_t total_cuts, Span<float> weights) {
  dh::safe_cuda(cudaSetDevice(device_));
+
  Span<SketchEntry> out;
  dh::device_vector<SketchEntry> cuts;
  bool first_window = this->Current().empty();
@ -378,6 +382,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
      });
  // Reverse scan to accumulate weights into first duplicated element on left.
  auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
+
  thrust::inclusive_scan_by_key(
      thrust::cuda::par(alloc), key_it, key_it + entries.size(),
      val_it, val_it,
@ -443,6 +448,7 @@ void SketchContainer::Prune(size_t to) {
 void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
                            Span<SketchEntry const> that) {
  dh::safe_cuda(cudaSetDevice(device_));
+
  timer_.Start(__func__);
  if (this->Current().size() == 0) {
    CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@ -478,6 +484,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,

 void SketchContainer::FixError() {
  dh::safe_cuda(cudaSetDevice(device_));
+
  auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
  auto in = dh::ToSpan(this->Current());
  dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@ -635,10 +642,12 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
    CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
    max_values.resize(d_in_columns_ptr.size() - 1);
    dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
+
    thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
                          thrust::make_discard_iterator(), d_max_values.begin(),
                          thrust::equal_to<bst_feature_t>{},
                          [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
+
    dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
    auto max_it = MakeIndexTransformIter([&](auto i) {
      if (IsCat(h_feature_types, i)) {
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@ -175,6 +175,7 @@ class SketchContainer {
  template <typename KeyComp = thrust::equal_to<size_t>>
  size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
    timer_.Start(__func__);
+
    dh::safe_cuda(cudaSetDevice(device_));
    this->columns_ptr_.SetDevice(device_);
    Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
@ -186,11 +187,13 @@ class SketchContainer {
    dh::XGBCachingDeviceAllocator<char> alloc;

    d_column_scan = this->columns_ptr_.DeviceSpan();
+
    size_t n_uniques = dh::SegmentedUnique(
        thrust::cuda::par(alloc), d_column_scan.data(),
        d_column_scan.data() + d_column_scan.size(), entries.data(),
        entries.data() + entries.size(), scan_out.DevicePointer(),
        entries.data(), detail::SketchUnique{}, key_comp);
+
    this->columns_ptr_.Copy(scan_out);
    CHECK(!this->columns_ptr_.HostCanRead());

--- a/src/common/quantile.hip
+++ b/src/common/quantile.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "quantile.cu"
+#endif
--- a/src/common/ranking_utils.cc
+++ b/src/common/ranking_utils.cc
@ -62,7 +62,7 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
  return rank;
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
 common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
                                                             common::Span<float const>) {
@ -108,9 +108,9 @@ void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
  });
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 DMLC_REGISTER_PARAMETER(LambdaRankParam);

@ -120,7 +120,7 @@ void PreCache::InitOnCPU(Context const*, MetaInfo const& info) {
                 [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
 #endif  // !defined(XGBOOST_USE_CUDA)

@ -130,9 +130,9 @@ void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
                 [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
 }

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

 std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
  std::string out_name;
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@ -23,6 +23,10 @@
 #include "xgboost/logging.h"                    // for CHECK
 #include "xgboost/span.h"                       // for Span

+#if defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
+
 namespace xgboost::ltr {
 namespace cuda_impl {
 void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
--- a/src/common/ranking_utils.hip
+++ b/src/common/ranking_utils.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "ranking_utils.cu"
+#endif
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@ -76,7 +76,7 @@ class RefResourceView {

  [[nodiscard]] size_type size() const { return size_; }  // NOLINT
  [[nodiscard]] size_type size_bytes() const {            // NOLINT
-    return Span{data(), size()}.size_bytes();
+    return Span<const value_type>{data(), size()}.size_bytes();
  }
  [[nodiscard]] value_type* data() { return ptr_; };              // NOLINT
  [[nodiscard]] value_type const* data() const { return ptr_; };  // NOLINT
--- a/src/common/stats.cu
+++ b/src/common/stats.cu
@ -18,6 +18,7 @@
 namespace xgboost {
 namespace common {
 namespace cuda_impl {
+
 void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
            common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
  CHECK_GE(t.Shape(1), 1);
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@ -216,6 +216,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
                                                         detail::SegOp<SegIt>{seg_beg, seg_end});
  auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                   detail::WeightOp<WIter>{w_begin, d_sorted_idx});
+
  thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                scan_val, weights_cdf.begin());

--- a/src/common/stats.h
+++ b/src/common/stats.h
@ -112,7 +112,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWe

 void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);

-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights,
                   linalg::Tensor<float, 1>*) {
  common::AssertGPUSupport();
@ -120,7 +120,7 @@ inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalW
 inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
  common::AssertGPUSupport();
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace cuda_impl

 /**
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Hui Liu	1ce5029a96	fix memory type	2024-01-26 15:45:04 -08:00
Hui Liu	420f8d6fde	merge v2.0.3 from upstream	2024-01-25 07:40:06 -08:00
Hui Liu	dc7ee041cc	use __HIPCC__ for device code	2024-01-24 12:32:51 -08:00
Hui Liu	7dc152450e	workaround memoryType and change rccl config	2024-01-11 13:13:14 -08:00
Philip Hyunsu Cho	82d846bbeb	Update change_scala_version.py to also change scala.version property (#9897 )	2023-12-18 23:49:41 -08:00
Philip Hyunsu Cho	71d330afdc	Bump version to 2.0.3 (#9895 )	2023-12-14 17:54:05 -08:00
Philip Hyunsu Cho	3acbd8692b	[jvm-packages] Fix POM for xgboost-jvm metapackage (#9893 ) * [jvm-packages] Fix POM for xgboost-jvm metapackage * Add script for updating the Scala version	2023-12-14 16:50:34 -08:00
Philip Hyunsu Cho	ad524f76ab	[backport] [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9887 ) * [CI] Set up CI for Mac M1 (#9699) * [CI] Improve CI for Mac M1 (#9748) * [CI] Build libxgboost4j.dylib with CMAKE_OSX_DEPLOYMENT_TARGET (#9749) * [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9886)	2023-12-13 16:05:40 -08:00
Jiaming Yuan	d2d1751c03	[backport][py] Use the first found native library. (#9860 ) (#9879 )	2023-12-13 14:20:30 +08:00
Jiaming Yuan	e4ee4e79dc	[backport][sklearn] Fix loading model attributes. (#9808 ) (#9880 )	2023-12-13 14:20:04 +08:00
Philip Hyunsu Cho	41ce8f28b2	[jvm-packages] Add Scala version suffix to xgboost-jvm package (#9776 ) * Update JVM script (#9714) * Bump version to 2.0.2; revamp pom.xml * Update instructions in prepare_jvm_release.py * Fix formatting	2023-11-08 10:17:26 -08:00
Jiaming Yuan	0ffc52e05c	[backport] Fix using categorical data with the ranker. (#9753 ) (#9778 )	2023-11-09 01:20:52 +08:00
Hui Liu	82d81bca94	rm hip.h files	2023-10-30 21:54:00 -07:00
Hui Liu	6ec5cf26fc	enable 3 more tests	2023-10-30 15:27:02 -07:00
Hui Liu	1ec57fd1a3	enable ROCm support, rm un-necessary code	2023-10-30 12:39:30 -07:00
Hui Liu	d0774a78e4	add hip to config	2023-10-30 12:01:24 -07:00
Hui Liu	8d160a206e	add jvm rocm support	2023-10-30 11:49:47 -07:00
Hui Liu	a41bc0975c	rocm enable for v2.0.1, rm setup.py	2023-10-27 18:53:16 -07:00
Hui Liu	782b73f2bb	rocm enable for v2.0.1	2023-10-27 18:50:28 -07:00
Philip Hyunsu Cho	a408254c2f	Use sys.base_prefix instead of sys.prefix (#9711 ) * Use sys.base_prefix instead of sys.prefix * Update libpath.py too	2023-10-23 23:31:40 -07:00
Philip Hyunsu Cho	22e891dafa	[jvm-packages] Remove hard dependency on libjvm (#9698 ) (#9705 )	2023-10-23 21:21:14 -07:00
Philip Hyunsu Cho	89530c80a7	[CI] Build libxgboost4j.dylib for Intel Mac (#9704 )	2023-10-23 20:45:01 -07:00
Philip Hyunsu Cho	946ab53b57	Fix libpath logic for Windows (#9687 )	2023-10-19 10:42:46 -07:00
Philip Hyunsu Cho	afd03a6934	Fix build for AppleClang 11 (#9684 )	2023-10-18 09:35:59 -07:00
Jiaming Yuan	f7da938458	[backport][pyspark] Support stage-level scheduling (#9519 ) (#9686 ) Co-authored-by: Bobby Wang <wbo4958@gmail.com>	2023-10-18 14:05:08 +08:00
Philip Hyunsu Cho	6ab6577511	Fix build for GCC 8.x (#9670 )	2023-10-12 23:36:41 -07:00
Philip Hyunsu Cho	8c57558d74	[backport] [CI] Pull CentOS 7 images from NGC (#9666 ) (#9668 )	2023-10-13 14:09:54 +08:00
Jiaming Yuan	58aa98a796	Bump version to 2.0.1. (#9660 )	2023-10-13 08:47:32 +08:00
Jiaming Yuan	92273b39d8	[backport] Add support for cgroupv2. (#9651 ) (#9656 )	2023-10-12 11:39:27 +08:00
Jiaming Yuan	e824b18bf6	[backport] Support pandas 2.1.0. (#9557 ) (#9655 )	2023-10-12 11:29:59 +08:00
Jiaming Yuan	66ee89d8b4	[backport] Workaround Apple clang issue. (#9615 ) (#9636 )	2023-10-08 15:42:15 +08:00
Jiaming Yuan	54d1d72d01	[backport] Use array interface for testing numpy arrays. (#9602 ) (#9635 )	2023-10-08 11:45:49 +08:00
Jiaming Yuan	032bcc57f9	[backport][R] Fix method name. (#9577 ) (#9592 )	2023-09-19 02:08:46 +08:00
Jiaming Yuan	ace7713201	[backport] Fix default metric configuration. (#9575 ) (#9590 )	2023-09-18 23:40:43 +08:00
Jiaming Yuan	096047c547	Make 2.0 release. (#9567 )	2023-09-12 00:20:49 +08:00
Jiaming Yuan	e75dd75bb2	[backport] [pyspark] support gpu transform (#9542 ) (#9559 ) --------- Co-authored-by: Bobby Wang <wbo4958@gmail.com>	2023-09-07 17:21:09 +08:00
Jiaming Yuan	4d387cbfbf	[backport] [pyspark] rework transform to reuse same code (#9292 ) (#9558 ) Co-authored-by: Bobby Wang <wbo4958@gmail.com>	2023-09-07 15:26:24 +08:00
Jiaming Yuan	3fde9361d7	[backport] Fix inplace predict with fallback when base margin is used. (#9536 ) (#9548 ) - Copy meta info from proxy DMatrix. - Use `std::call_once` to emit less warnings.	2023-09-05 23:38:06 +08:00
Jiaming Yuan	b67c2ed96d	[backport] [CI] bump `setup-r` action version. (#9544 ) (#9551 )	2023-09-05 22:10:30 +08:00
Jiaming Yuan	177fd79864	[backport] Fix read the doc configuration. [skip ci] (#9549 )	2023-09-05 17:32:00 +08:00
Jiaming Yuan	06487d3896	[backport] Fix GPU categorical split memory allocation. (#9529 ) (#9535 )	2023-08-29 21:14:43 +08:00
Jiaming Yuan	e50ccc4d3c	[R] Fix integer inputs with NA. (#9522 ) (#9534 )	2023-08-29 19:52:13 +08:00
Jiaming Yuan	add57f8880	[backport] Delay the check for vector leaf. (#9509 ) (#9533 )	2023-08-29 18:25:59 +08:00
Jiaming Yuan	a0d3573c74	[backport] Fix device dispatch for linear updater. (#9507 ) (#9532 )	2023-08-29 15:10:43 +08:00
Jiaming Yuan	4301558a57	Make 2.0.0 RC1. (#9492 )	2023-08-17 16:16:51 +08:00
amdsc21	2e7e9d3b2d	update rocgputreeshap branch	2023-06-23 19:50:08 +02:00
amdsc21	3e0c7d1dee	new url for rocgputreeshap	2023-06-23 19:46:45 +02:00
amdsc21	2f47a1ebe6	rm warp-primitives	2023-06-22 21:43:00 +02:00
amdsc21	5ca7daaa13	merge latest changes	2023-06-15 21:39:14 +02:00
amdsc21	5f78360949	merge changes Jun092023	2023-06-09 22:41:33 +02:00
amdsc21	35cde3b1b2	remove some hip.h	2023-06-07 04:48:09 +02:00
amdsc21	ce345c30a8	remove some hip.h	2023-06-07 03:39:01 +02:00
amdsc21	af8845405a	sync Jun 5	2023-06-07 02:43:21 +02:00
amdsc21	9ee1852d4e	restore device helper	2023-06-02 02:55:13 +02:00
Your Name	6ecd7903f2	Merge branch 'master' into sync-condition-2023Jun01	2023-06-01 15:58:31 -07:00
Your Name	42867a4805	sync Jun 1	2023-06-01 15:55:06 -07:00
amdsc21	c5b575e00e	fix host __assert_fail	2023-05-24 19:40:24 +02:00
amdsc21	1354138b7d	Merge branch 'master' into sync-condition-2023May15	2023-05-24 17:44:16 +02:00
amdsc21	b994a38b28	Merge branch 'master' into sync-condition-2023May15	2023-05-23 01:07:50 +02:00
amdsc21	3a834c4992	change workflow	2023-05-20 07:04:06 +02:00
amdsc21	b22644fc10	add hip.h	2023-05-20 01:25:33 +02:00
amdsc21	7663d47383	Merge branch 'master' into sync-condition-2023May15	2023-05-19 20:30:35 +02:00
amdsc21	88fc8badfa	Merge branch 'master' into sync-condition-2023May15	2023-05-17 19:55:50 +02:00
amdsc21	8cad8c693c	sync up May15 2023	2023-05-15 18:59:18 +02:00
amdsc21	b066accad6	fix lambdarank_obj	2023-05-02 21:06:22 +02:00
amdsc21	b324d51f14	fix array_interface.h half type	2023-05-02 20:50:50 +02:00
amdsc21	65097212b3	fix IterativeDeviceDMatrix, support HIP	2023-05-02 20:20:11 +02:00
amdsc21	4a24ca2f95	fix helpers.h, enable HIP	2023-05-02 20:04:23 +02:00
amdsc21	83e6fceb5c	fix lambdarank_obj.cc, support HIP	2023-05-02 19:03:18 +02:00
amdsc21	e4538cb13c	fix, to support hip	2023-05-02 17:43:11 +02:00
amdsc21	5446c501af	merge 23Mar01	2023-05-02 00:05:58 +02:00
amdsc21	313a74b582	add Shap Magic to check if use cat	2023-05-01 21:55:14 +02:00
amdsc21	65d83e288f	fix device query	2023-04-19 19:53:26 +02:00
amdsc21	f645cf51c1	Merge branch 'master' into sync-condition-2023Apr11	2023-04-17 18:33:00 +02:00
amdsc21	db8420225b	fix RCCL	2023-04-12 01:09:14 +02:00
amdsc21	843fdde61b	sync Apr 11 2023	2023-04-11 20:03:25 +02:00
amdsc21	08bc4b0c0f	Merge branch 'master' into sync-condition-2023Apr11	2023-04-11 19:38:38 +02:00
amdsc21	6825d986fd	move Dockerfile to ci	2023-04-11 19:34:23 +02:00
paklui	d155ec77f9	building docker for xgboost-amd-condition	2023-03-30 13:36:39 -07:00
amdsc21	991738690f	Merge branch 'sync-condition-2023Mar27' into amd-condition	2023-03-30 05:16:36 +02:00
amdsc21	aeb3fd1c95	Merge branch 'master' into sync-condition-2023Mar27	2023-03-30 05:15:55 +02:00
amdsc21	141a062e00	Merge branch 'sync-condition-2023Mar27' into amd-condition	2023-03-30 00:47:16 +02:00
amdsc21	acad01afc9	sync Mar 29	2023-03-30 00:46:50 +02:00
amdsc21	f289e5001d	Merge branch 'sync-condition-2023Mar27' into amd-condition	2023-03-28 00:24:12 +02:00
amdsc21	06d9b998ce	fix CAPI BuildInfo	2023-03-28 00:14:18 +02:00
amdsc21	c50cc424bc	sync Mar 27 2023	2023-03-27 18:54:41 +02:00
amdsc21	8c77e936d1	tune grid size	2023-03-26 17:45:19 +02:00
amdsc21	18034a4291	tune histogram	2023-03-26 01:42:51 +01:00
amdsc21	7ee4734d3a	rm device_helpers.hip.h from cu	2023-03-26 00:24:11 +01:00
amdsc21	ee582f03c3	rm device_helpers.hip.h from cuh	2023-03-25 23:35:57 +01:00
amdsc21	f3286bac04	rm warp header	2023-03-25 23:01:44 +01:00
amdsc21	3ee3bea683	fix warp header	2023-03-25 22:37:37 +01:00
amdsc21	5098735698	Merge branch 'condition-sync-Mar24-23' into hui-condition	2023-03-25 05:28:40 +01:00
amdsc21	e74b3bbf3c	fix macro	2023-03-25 05:17:39 +01:00
amdsc21	22525c002a	fix macro	2023-03-25 05:08:30 +01:00
amdsc21	80961039d7	fix macro	2023-03-25 05:00:55 +01:00
amdsc21	1474789787	add new file	2023-03-25 04:54:02 +01:00
amdsc21	1dc138404a	initial merge, fix linalg.h	2023-03-25 04:48:47 +01:00
amdsc21	e1d050f64e	initial merge, fix linalg.h	2023-03-25 04:37:43 +01:00
amdsc21	7fbc561e17	initial merge	2023-03-25 04:31:55 +01:00
amdsc21	d97be6f396	enable last 3 tests	2023-03-25 04:05:05 +01:00
amdsc21	f1211cffca	enable last 3 tests	2023-03-25 00:45:52 +01:00
amdsc21	e0716afabf	fix objective/objective.cc, CMakeFile and setup.py	2023-03-23 20:22:34 +01:00
amdsc21	595cd81251	add max shared mem workaround	2023-03-19 20:08:42 +01:00
amdsc21	0325ce0bed	update gputreeshap	2023-03-19 20:07:36 +01:00
amdsc21	a79a35c22c	add warp size	2023-03-15 22:00:26 +01:00
amdsc21	4484c7f073	disable Optin Shared Mem	2023-03-15 02:10:16 +01:00
amdsc21	8207015e48	fix ../tests/cpp/common/test_span.h	2023-03-14 22:19:06 +01:00
amdsc21	364df7db0f	fix ../tree/gpu_hist/evaluate_splits.hip bugs, size 64	2023-03-14 06:17:21 +01:00
amdsc21	a2bab03205	fix aft_obj.hip	2023-03-13 23:19:59 +01:00
amdsc21	b71c1b50de	fix macro, no !	2023-03-12 23:02:28 +01:00
amdsc21	fa2336fcfd	sort bug fix	2023-03-12 07:09:10 +01:00
amdsc21	7d96758382	macro format	2023-03-11 06:57:24 +01:00
amdsc21	b0dacc5a80	fix bug	2023-03-11 03:47:23 +01:00
amdsc21	f64152bf97	add helpers.hip	2023-03-11 02:56:50 +01:00
amdsc21	b4dbe7a649	fix isnan	2023-03-11 02:39:58 +01:00
amdsc21	e5b6219a84	typo	2023-03-11 02:30:27 +01:00
amdsc21	3a07b1edf8	complete test porting	2023-03-11 02:17:05 +01:00
amdsc21	9bf16a2ca6	testing porting	2023-03-11 01:38:54 +01:00
amdsc21	332f6a89a9	more tests	2023-03-11 01:33:48 +01:00
amdsc21	204d0c9a53	add hip tests	2023-03-11 00:38:16 +01:00
amdsc21	e961016e71	rm HIPCUB	2023-03-10 22:21:37 +01:00
amdsc21	f0b8c02f15	merge latest changes	2023-03-10 22:10:20 +01:00
amdsc21	5e8b1842b9	fix Pointer Attr	2023-03-10 19:06:02 +01:00
amdsc21	9f072b50ba	fix __popc	2023-03-10 17:14:31 +01:00
amdsc21	e1ddb5ae58	fix macro XGBOOST_USE_HIP	2023-03-10 07:11:05 +01:00
amdsc21	643e2a7b39	fix macro XGBOOST_USE_HIP	2023-03-10 07:09:41 +01:00
amdsc21	bde3107c3e	fix macro XGBOOST_USE_HIP	2023-03-10 07:01:25 +01:00
amdsc21	5edfc1e2e9	finish ellpack_page.cc	2023-03-10 06:41:25 +01:00
amdsc21	c073417d0c	finish aft_obj.cu	2023-03-10 06:39:03 +01:00
amdsc21	9bbbeb3f03	finish multiclass_obj.cu	2023-03-10 06:35:46 +01:00
amdsc21	4bde2e3412	finish multiclass_obj.cu	2023-03-10 06:35:21 +01:00
amdsc21	58a9fe07b6	finish multiclass_obj.cu	2023-03-10 06:35:06 +01:00
amdsc21	41407850d5	finish rank_obj.cu	2023-03-10 06:29:08 +01:00
amdsc21	968a1db4c0	finish regression_obj.cu	2023-03-10 06:07:53 +01:00
amdsc21	ad710e4888	finish hinge.cu	2023-03-10 06:04:59 +01:00
amdsc21	4e3c699814	finish adaptive.cu	2023-03-10 06:02:48 +01:00
amdsc21	757de84398	finish quantile.cu	2023-03-10 05:55:51 +01:00
amdsc21	d27f9dfdce	finish host_device_vector.cu	2023-03-10 05:45:38 +01:00
amdsc21	14cc438a64	finish stats.cu	2023-03-10 05:38:16 +01:00
amdsc21	911a5d8a60	finish hist_util.cu	2023-03-10 05:32:38 +01:00
amdsc21	54b076b40f	finish common.cu	2023-03-10 05:20:29 +01:00
amdsc21	91a5ef762e	finish common.cu	2023-03-10 05:19:41 +01:00
amdsc21	8fd2af1c8b	finish numeric.cu	2023-03-10 05:16:23 +01:00
amdsc21	bb6adda8a3	finish c_api.cu	2023-03-10 05:12:51 +01:00
amdsc21	a76ccff390	finish c_api.cu	2023-03-10 05:11:20 +01:00
amdsc21	61c0b19331	finish ellpack_page_source.cu	2023-03-10 05:06:36 +01:00
amdsc21	fa9f69dd85	finish sparse_page_dmatrix.cu	2023-03-10 05:04:57 +01:00
amdsc21	080fc35c4b	finish ellpack_page_raw_format.cu	2023-03-10 05:02:35 +01:00
amdsc21	ccce4cf7e1	finish data.cu	2023-03-10 05:00:57 +01:00
amdsc21	713ab9e1a0	finish sparse_page_source.cu	2023-03-10 04:42:56 +01:00
amdsc21	134cbfddbe	finish gradient_index.cu	2023-03-10 04:40:33 +01:00
amdsc21	6e2c5be83e	finish array_interface.cu	2023-03-10 04:36:04 +01:00
amdsc21	185dbce21f	finish ellpack_page.cu	2023-03-10 04:26:09 +01:00
amdsc21	49732359ef	finish iterative_dmatrix.cu	2023-03-10 03:47:00 +01:00
amdsc21	ec9f500a49	finish proxy_dmatrix.cu	2023-03-10 03:40:07 +01:00
amdsc21	53244bef6f	finish simple_dmatrix.cu	2023-03-10 03:38:09 +01:00
amdsc21	f0febfbcac	finish gpu_predictor.cu	2023-03-10 01:29:54 +01:00
amdsc21	1c58ff61d1	finish fit_stump.cu	2023-03-10 00:46:29 +01:00
amdsc21	1530c03f7d	finish constraints.cu	2023-03-09 22:43:51 +01:00
amdsc21	309268de02	finish updater_gpu_hist.cu	2023-03-09 22:40:44 +01:00
amdsc21	500428cc0f	finish row_partitioner.cu	2023-03-09 22:31:11 +01:00
amdsc21	495816f694	finished gradient_based_sampler.cu	2023-03-09 22:26:08 +01:00
amdsc21	df42dd2c53	finished evaluator.cu	2023-03-09 22:22:05 +01:00
amdsc21	f55243fda0	finish evaluate_splits.cu	2023-03-09 22:15:10 +01:00
amdsc21	1e09c21456	finished feature_groups.cu	2023-03-09 21:31:00 +01:00
amdsc21	0ed5d3c849	finished histogram.cu	2023-03-09 21:28:37 +01:00
amdsc21	f67e7de7ef	finished communicator.cu	2023-03-09 21:02:48 +01:00
amdsc21	5044713388	finished updater_gpu_coordinate.cu	2023-03-09 20:53:54 +01:00
amdsc21	c875f0425f	finished rank_metric.cu	2023-03-09 20:48:31 +01:00
amdsc21	4fd08b6c32	finished survival_metric.cu	2023-03-09 20:41:52 +01:00
amdsc21	b9d86d44d6	finish multiclass_metric.cu	2023-03-09 20:37:16 +01:00
amdsc21	a56055225a	fix auc.cu	2023-03-09 20:29:38 +01:00
amdsc21	6eba0a56ec	fix CMakeLists.txt	2023-03-09 18:57:14 +01:00
amdsc21	00c24a58b1	finish elementwise_metric.cu	2023-03-08 22:50:07 +01:00
amdsc21	6fa248b75f	try elementwise_metric.cu	2023-03-08 22:42:48 +01:00
amdsc21	946f9e9802	fix gbtree.cc	2023-03-08 21:44:20 +01:00
amdsc21	4c4e5af29c	port elementwise_metric.cu	2023-03-08 21:39:56 +01:00
amdsc21	7e1b06417b	finish gbtree.cu porting	2023-03-08 21:09:56 +01:00
amdsc21	cdd7794641	add unused option	2023-03-08 20:37:53 +01:00
amdsc21	cd743a1ae9	fix DispatchRadixSort	2023-03-08 20:31:23 +01:00
amdsc21	a45005863b	fix DispatchScan	2023-03-08 20:15:33 +01:00
amdsc21	bdcb036592	add context.hip	2023-03-08 07:34:19 +01:00
amdsc21	7a3a9b682a	add device_helpers.hip.h	2023-03-08 07:18:33 +01:00
amdsc21	0a711662c3	add device_helpers.hip.h	2023-03-08 07:10:32 +01:00
amdsc21	312e58ec99	enable rocm, fix common.h	2023-03-08 06:45:03 +01:00
amdsc21	ca8f4e7993	enable rocm, fix stats.cuh	2023-03-08 06:43:06 +01:00
amdsc21	60795f22de	enable rocm, fix linalg_op.cuh	2023-03-08 06:42:20 +01:00
amdsc21	05fdca893f	enable rocm, fix cuda_pinned_allocator.h	2023-03-08 06:39:40 +01:00
amdsc21	d8cc93f3f2	enable rocm, fix algorithm.cuh	2023-03-08 06:38:35 +01:00
amdsc21	62c4efac51	enable rocm, fix transform.h	2023-03-08 06:37:34 +01:00
amdsc21	ba9e00d911	enable rocm, fix hist_util.cuh	2023-03-08 06:36:15 +01:00
amdsc21	d3be67ad8e	enable rocm, fix quantile.cuh	2023-03-08 06:32:09 +01:00
amdsc21	2eb0b6aae4	enable rocm, fix threading_utils.cuh	2023-03-08 06:30:52 +01:00
amdsc21	327f1494f1	enable rocm, fix cuda_context.cuh	2023-03-08 06:29:45 +01:00
amdsc21	fa92aa56ee	enable rocm, fix device_adapter.cuh	2023-03-08 06:26:31 +01:00
amdsc21	427f6c2a1a	enable rocm, fix simple_dmatrix.cuh	2023-03-08 06:24:34 +01:00
amdsc21	270c7b4802	enable rocm, fix row_partitioner.cuh	2023-03-08 06:22:25 +01:00
amdsc21	0fc1f640a9	enable rocm, fix nccl_device_communicator.cuh	2023-03-08 06:18:13 +01:00
amdsc21	762fd9028d	enable rocm, fix device_communicator_adapter.cuh	2023-03-08 06:13:29 +01:00
amdsc21	f2009533e1	rm hip.h	2023-03-08 06:04:01 +01:00
amdsc21	53b5cd73f2	add hip flags	2023-03-08 03:42:51 +01:00
amdsc21	52b05d934e	add hip	2023-03-08 03:32:19 +01:00
amdsc21	840f15209c	add HIP flags, common	2023-03-08 03:11:49 +01:00
amdsc21	1e1c7fd8d5	add HIP flags, c_api	2023-03-08 01:34:37 +01:00
amdsc21	f5f800c80d	add HIP flags	2023-03-08 01:33:38 +01:00
amdsc21	6b7be96373	add HIP flags	2023-03-08 01:22:25 +01:00
amdsc21	75712b9c3c	enable HIP flags	2023-03-08 01:10:07 +01:00
amdsc21	ed45aa2816	Merge branch 'master' into dev-hui	2023-03-08 00:39:33 +01:00
amdsc21	f286ae5bfa	add hip rocthrust hipcub	2023-03-07 06:35:00 +01:00
amdsc21	f13a7f8d91	add submodules	2023-03-07 05:44:24 +01:00
amdsc21	c51a1c9aae	rename hip.cc to hip	2023-03-07 05:39:53 +01:00
amdsc21	30de728631	fix hip.cc	2023-03-07 05:11:42 +01:00
amdsc21	75fa15b36d	add hip support	2023-03-07 04:02:49 +01:00
amdsc21	eb30cb6293	add hip support	2023-03-07 03:49:52 +01:00
amdsc21	cafbfce51f	add hip.h	2023-03-07 03:46:26 +01:00
amdsc21	6039a71e6c	add hip structure	2023-03-07 02:17:19 +01:00
				`@ -0,0 +1 @@`
				`Subproject commit 2fea6734e83cf147c1bbe580ac4713cd50abcad5`