Compare commits

34 Commits

Author SHA1 Message Date
Philip Hyunsu Cho
82d846bbeb
Update change_scala_version.py to also change scala.version property (#9897) 2023-12-18 23:49:41 -08:00
Philip Hyunsu Cho
71d330afdc
Bump version to 2.0.3 (#9895) 2023-12-14 17:54:05 -08:00
Philip Hyunsu Cho
3acbd8692b
[jvm-packages] Fix POM for xgboost-jvm metapackage (#9893)
* [jvm-packages] Fix POM for xgboost-jvm metapackage

* Add script for updating the Scala version
2023-12-14 16:50:34 -08:00
Philip Hyunsu Cho
ad524f76ab
[backport] [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9887)
* [CI] Set up CI for Mac M1 (#9699)

* [CI] Improve CI for Mac M1 (#9748)

* [CI] Build libxgboost4j.dylib with CMAKE_OSX_DEPLOYMENT_TARGET (#9749)

* [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9886)
2023-12-13 16:05:40 -08:00
Jiaming Yuan
d2d1751c03
[backport][py] Use the first found native library. (#9860) (#9879) 2023-12-13 14:20:30 +08:00
Jiaming Yuan
e4ee4e79dc
[backport][sklearn] Fix loading model attributes. (#9808) (#9880) 2023-12-13 14:20:04 +08:00
Philip Hyunsu Cho
41ce8f28b2
[jvm-packages] Add Scala version suffix to xgboost-jvm package (#9776)
* Update JVM script (#9714)

* Bump version to 2.0.2; revamp pom.xml

* Update instructions in prepare_jvm_release.py

* Fix formatting
2023-11-08 10:17:26 -08:00
Jiaming Yuan
0ffc52e05c
[backport] Fix using categorical data with the ranker. (#9753) (#9778) 2023-11-09 01:20:52 +08:00
Philip Hyunsu Cho
a408254c2f
Use sys.base_prefix instead of sys.prefix (#9711)
* Use sys.base_prefix instead of sys.prefix

* Update libpath.py too
2023-10-23 23:31:40 -07:00
Philip Hyunsu Cho
22e891dafa
[jvm-packages] Remove hard dependency on libjvm (#9698) (#9705) 2023-10-23 21:21:14 -07:00
Philip Hyunsu Cho
89530c80a7 [CI] Build libxgboost4j.dylib for Intel Mac (#9704) 2023-10-23 20:45:01 -07:00
Philip Hyunsu Cho
946ab53b57
Fix libpath logic for Windows (#9687) 2023-10-19 10:42:46 -07:00
Philip Hyunsu Cho
afd03a6934
Fix build for AppleClang 11 (#9684) 2023-10-18 09:35:59 -07:00
Jiaming Yuan
f7da938458
[backport][pyspark] Support stage-level scheduling (#9519) (#9686)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-10-18 14:05:08 +08:00
Philip Hyunsu Cho
6ab6577511
Fix build for GCC 8.x (#9670) 2023-10-12 23:36:41 -07:00
Philip Hyunsu Cho
8c57558d74
[backport] [CI] Pull CentOS 7 images from NGC (#9666) (#9668) 2023-10-13 14:09:54 +08:00
Jiaming Yuan
58aa98a796
Bump version to 2.0.1. (#9660) 2023-10-13 08:47:32 +08:00
Jiaming Yuan
92273b39d8
[backport] Add support for cgroupv2. (#9651) (#9656) 2023-10-12 11:39:27 +08:00
Jiaming Yuan
e824b18bf6
[backport] Support pandas 2.1.0. (#9557) (#9655) 2023-10-12 11:29:59 +08:00
Jiaming Yuan
66ee89d8b4
[backport] Workaround Apple clang issue. (#9615) (#9636) 2023-10-08 15:42:15 +08:00
Jiaming Yuan
54d1d72d01
[backport] Use array interface for testing numpy arrays. (#9602) (#9635) 2023-10-08 11:45:49 +08:00
Jiaming Yuan
032bcc57f9
[backport][R] Fix method name. (#9577) (#9592) 2023-09-19 02:08:46 +08:00
Jiaming Yuan
ace7713201
[backport] Fix default metric configuration. (#9575) (#9590) 2023-09-18 23:40:43 +08:00
Jiaming Yuan
096047c547
Make 2.0 release. (#9567) 2023-09-12 00:20:49 +08:00
Jiaming Yuan
e75dd75bb2
[backport] [pyspark] support gpu transform (#9542) (#9559)
---------

Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-09-07 17:21:09 +08:00
Jiaming Yuan
4d387cbfbf
[backport] [pyspark] rework transform to reuse same code (#9292) (#9558)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-09-07 15:26:24 +08:00
Jiaming Yuan
3fde9361d7
[backport] Fix inplace predict with fallback when base margin is used. (#9536) (#9548)
- Copy meta info from proxy DMatrix.
- Use `std::call_once` to emit less warnings.
2023-09-05 23:38:06 +08:00
Jiaming Yuan
b67c2ed96d
[backport] [CI] bump setup-r action version. (#9544) (#9551) 2023-09-05 22:10:30 +08:00
Jiaming Yuan
177fd79864
[backport] Fix read the doc configuration. [skip ci] (#9549) 2023-09-05 17:32:00 +08:00
Jiaming Yuan
06487d3896
[backport] Fix GPU categorical split memory allocation. (#9529) (#9535) 2023-08-29 21:14:43 +08:00
Jiaming Yuan
e50ccc4d3c
[R] Fix integer inputs with NA. (#9522) (#9534) 2023-08-29 19:52:13 +08:00
Jiaming Yuan
add57f8880
[backport] Delay the check for vector leaf. (#9509) (#9533) 2023-08-29 18:25:59 +08:00
Jiaming Yuan
a0d3573c74
[backport] Fix device dispatch for linear updater. (#9507) (#9532) 2023-08-29 15:10:43 +08:00
Jiaming Yuan
4301558a57
Make 2.0.0 RC1. (#9492) 2023-08-17 16:16:51 +08:00
78 changed files with 1559 additions and 629 deletions

View File

@@ -51,14 +51,14 @@ jobs:
id: extract_branch
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
-matrix.os == 'windows-latest'
+(matrix.os == 'windows-latest' || matrix.os == 'macos-11')
- name: Publish artifact xgboost4j.dll to S3
run: |
cd lib/
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
dir
-python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
+python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
@@ -66,6 +66,19 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Publish artifact libxgboost4j.dylib to S3
run: |
cd lib/
mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
ls
python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'macos-11'
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Test XGBoost4J (Core, Spark, Examples)
run: |

View File

@@ -25,7 +25,7 @@ jobs:
with:
submodules: 'true'
-- uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1
+- uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
with:
r-version: ${{ matrix.config.r }}
@@ -64,7 +64,7 @@ jobs:
with:
submodules: 'true'
-- uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1
+- uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
with:
r-version: ${{ matrix.config.r }}

View File

@@ -32,4 +32,3 @@ formats:
python:
install:
- requirements: doc/requirements.txt
-system_packages: true

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 2.0.0)
+project(xgboost LANGUAGES CXX C VERSION 2.0.3)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
@@ -233,6 +233,11 @@ endif (RABIT_BUILD_MPI)
add_subdirectory(${xgboost_SOURCE_DIR}/src)
target_link_libraries(objxgboost PUBLIC dmlc)
+# Link -lstdc++fs for GCC 8.x
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
+  target_link_libraries(objxgboost PUBLIC stdc++fs)
+endif()
# Exports some R specific definitions and objects
if (R_LIB)
add_subdirectory(${xgboost_SOURCE_DIR}/R-package)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
-Version: 2.0.0.1
-Date: 2022-10-18
+Version: 2.0.3.1
+Date: 2023-12-14
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),

View File

@@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
i == env$begin_iteration ||
i == env$end_iteration) {
stdev <- if (showsd) env$bst_evaluation_err else NULL
-msg <- format.eval.string(i, env$bst_evaluation, stdev)
+msg <- .format_eval_string(i, env$bst_evaluation, stdev)
cat(msg, '\n')
}
}
@@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if ((maximize && score > best_score) ||
(!maximize && score < best_score)) {
-best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+best_msg <<- .format_eval_string(
+  i, env$bst_evaluation, env$bst_evaluation_err
+)
best_score <<- score
best_iteration <<- i
best_ntreelimit <<- best_iteration * env$num_parallel_tree
@@ -754,7 +756,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
#
# Format the evaluation metric string
-format.eval.string <- function(iter, eval_res, eval_err = NULL) {
+.format_eval_string <- function(iter, eval_res, eval_err = NULL) {
if (length(eval_res) == 0)
stop('no evaluation results')
enames <- names(eval_res)

R-package/configure (vendored)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for xgboost 2.0.0.
+# Generated by GNU Autoconf 2.71 for xgboost 2.0.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost'
-PACKAGE_VERSION='2.0.0'
-PACKAGE_STRING='xgboost 2.0.0'
+PACKAGE_VERSION='2.0.3'
+PACKAGE_STRING='xgboost 2.0.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems.
+\`configure' configures xgboost 2.0.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
-short | recursive ) echo "Configuration of xgboost 2.0.0:";;
+short | recursive ) echo "Configuration of xgboost 2.0.3:";;
esac
cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-xgboost configure 2.0.0
+xgboost configure 2.0.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by xgboost $as_me 2.0.0, which was
+It was created by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by xgboost $as_me 2.0.0, which was
+This file was extended by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
-xgboost config.status 2.0.0
+xgboost config.status 2.0.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -2,7 +2,7 @@
AC_PREREQ(2.69)
-AC_INIT([xgboost],[2.0.0],[],[xgboost],[])
+AC_INIT([xgboost],[2.0.3],[],[xgboost],[])
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then

View File

@@ -120,11 +120,25 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
  ctx.nthread = asInteger(n_threads);
  std::int32_t threads = ctx.Threads();
-  xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
-    for (size_t j = 0; j < ncol; ++j) {
-      data[i * ncol + j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
-    }
-  });
+  if (is_int) {
+    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
+      for (size_t j = 0; j < ncol; ++j) {
+        auto v = iin[i + nrow * j];
+        if (v == NA_INTEGER) {
+          data[i * ncol + j] = std::numeric_limits<float>::quiet_NaN();
+        } else {
+          data[i * ncol + j] = static_cast<float>(v);
+        }
+      }
+    });
+  } else {
+    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
+      for (size_t j = 0; j < ncol; ++j) {
+        data[i * ncol + j] = din[i + nrow * j];
+      }
+    });
+  }
  DMatrixHandle handle;
  CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol,
                                        asReal(missing), &handle, threads));

View File

@@ -56,6 +56,42 @@ test_that("xgb.DMatrix: basic construction", {
expect_equal(raw_fd, raw_dgc)
})
test_that("xgb.DMatrix: NA", {
n_samples <- 3
x <- cbind(
x1 = sample(x = 4, size = n_samples, replace = TRUE),
x2 = sample(x = 4, size = n_samples, replace = TRUE)
)
x[1, "x1"] <- NA
m <- xgb.DMatrix(x)
xgb.DMatrix.save(m, "int.dmatrix")
x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
colnames(x) <- c("x1", "x2")
m <- xgb.DMatrix(x)
xgb.DMatrix.save(m, "float.dmatrix")
iconn <- file("int.dmatrix", "rb")
fconn <- file("float.dmatrix", "rb")
expect_equal(file.size("int.dmatrix"), file.size("float.dmatrix"))
bytes <- file.size("int.dmatrix")
idmatrix <- readBin(iconn, "raw", n = bytes)
fdmatrix <- readBin(fconn, "raw", n = bytes)
expect_equal(length(idmatrix), length(fdmatrix))
expect_equal(idmatrix, fdmatrix)
close(iconn)
close(fconn)
file.remove("int.dmatrix")
file.remove("float.dmatrix")
})
test_that("xgb.DMatrix: saving, loading", { test_that("xgb.DMatrix: saving, loading", {
# save to a local file # save to a local file
dtest1 <- xgb.DMatrix(test_data, label = test_label) dtest1 <- xgb.DMatrix(test_data, label = test_label)

View File

@@ -0,0 +1,79 @@
import argparse
import pathlib
import re
import shutil
def main(args):
if args.scala_version == "2.12":
scala_ver = "2.12"
scala_patchver = "2.12.18"
elif args.scala_version == "2.13":
scala_ver = "2.13"
scala_patchver = "2.13.11"
else:
raise ValueError(f"Unsupported Scala version: {args.scala_version}")
# Clean artifacts
if args.purge_artifacts:
for target in pathlib.Path("jvm-packages/").glob("**/target"):
if target.is_dir():
print(f"Removing {target}...")
shutil.rmtree(target)
# Update pom.xml
for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"):
print(f"Updating {pom}...")
with open(pom, "r", encoding="utf-8") as f:
lines = f.readlines()
with open(pom, "w", encoding="utf-8") as f:
replaced_scalaver = False
replaced_scala_binver = False
for line in lines:
for artifact in [
"xgboost-jvm",
"xgboost4j",
"xgboost4j-gpu",
"xgboost4j-spark",
"xgboost4j-spark-gpu",
"xgboost4j-flink",
"xgboost4j-example",
]:
line = re.sub(
f"<artifactId>{artifact}_[0-9\\.]*",
f"<artifactId>{artifact}_{scala_ver}",
line,
)
# Only replace the first occurrence of scala.version
if not replaced_scalaver:
line, nsubs = re.subn(
r"<scala.version>[0-9\.]*",
f"<scala.version>{scala_patchver}",
line,
)
if nsubs > 0:
replaced_scalaver = True
# Only replace the first occurrence of scala.binary.version
if not replaced_scala_binver:
line, nsubs = re.subn(
r"<scala.binary.version>[0-9\.]*",
f"<scala.binary.version>{scala_ver}",
line,
)
if nsubs > 0:
replaced_scala_binver = True
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--purge-artifacts", action="store_true")
parser.add_argument(
"--scala-version",
type=str,
required=True,
help="Version of Scala to use in the JVM packages",
choices=["2.12", "2.13"],
)
parsed_args = parser.parse_args()
main(parsed_args)
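The new script rewrites only the first <scala.version> and <scala.binary.version> property it encounters in each pom.xml, using the substitution count returned by re.subn as a guard so that any later occurrence in the same file is left untouched. A minimal standalone sketch of that pattern (the sample lines below are illustrative, not taken from any real pom.xml):

import re

pom_lines = [
    "<scala.binary.version>2.12</scala.binary.version>",
    "<scala.version>2.12.18</scala.version>",
    "<scala.version>2.12.18</scala.version>",  # a later occurrence must stay untouched
]

replaced = False
rewritten = []
for line in pom_lines:
    if not replaced:
        # re.subn returns the new string plus the number of substitutions made
        line, nsubs = re.subn(r"<scala.version>[0-9\.]*", "<scala.version>2.13.11", line)
        if nsubs > 0:
            replaced = True
    rewritten.append(line)

print(rewritten)  # only the first <scala.version> entry is rewritten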

View File

@@ -2,7 +2,6 @@ import argparse
import errno
import glob
import os
-import platform
import re
import shutil
import subprocess
@@ -21,12 +20,14 @@ def normpath(path):
    else:
        return normalized

def cp(source, target):
    source = normpath(source)
    target = normpath(target)
    print("cp {0} {1}".format(source, target))
    shutil.copy(source, target)

def maybe_makedirs(path):
    path = normpath(path)
    print("mkdir -p " + path)
@@ -36,6 +37,7 @@ def maybe_makedirs(path):
        if e.errno != errno.EEXIST:
            raise

@contextmanager
def cd(path):
    path = normpath(path)
@@ -47,18 +49,22 @@ def cd(path):
    finally:
        os.chdir(cwd)

def run(command, **kwargs):
    print(command)
    subprocess.check_call(command, shell=True, **kwargs)

def get_current_git_tag():
    out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"])
    return out.decode().split("\n")[0]

def get_current_commit_hash():
    out = subprocess.check_output(["git", "rev-parse", "HEAD"])
    return out.decode().split("\n")[0]

def get_current_git_branch():
    out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"])
    m = re.search(r"release_[0-9\.]+", out.decode())
@@ -66,38 +72,49 @@ def get_current_git_branch():
        raise ValueError("Expected branch name of form release_xxx")
    return m.group(0)

def retrieve(url, filename=None):
    print(f"{url} -> {filename}")
    return urlretrieve(url, filename)

def main():
    parser = argparse.ArgumentParser()
parser.add_argument("--release-version", type=str, required=True, parser.add_argument(
help="Version of the release being prepared") "--release-version",
type=str,
required=True,
help="Version of the release being prepared",
)
args = parser.parse_args() args = parser.parse_args()
if sys.platform != "darwin" or platform.machine() != "x86_64":
raise NotImplementedError("Please run this script using an Intel Mac")
    version = args.release_version
    expected_git_tag = "v" + version
    current_git_tag = get_current_git_tag()
    if current_git_tag != expected_git_tag:
        if not current_git_tag:
-            raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
-                             f"Run: git checkout {expected_git_tag}")
-        raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD is at tag "
-                         f"{current_git_tag}. Run: git checkout {expected_git_tag}")
+            raise ValueError(
+                f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
+                f"Run: git checkout {expected_git_tag}"
+            )
+        raise ValueError(
+            f"Expected git tag {expected_git_tag} but current HEAD is at tag "
+            f"{current_git_tag}. Run: git checkout {expected_git_tag}"
+        )

    commit_hash = get_current_commit_hash()
    git_branch = get_current_git_branch()
-    print(f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}")
+    print(
+        f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}"
+    )
with cd("jvm-packages/"): with cd("jvm-packages/"):
print("====copying pure-Python tracker====") print("====copying pure-Python tracker====")
for use_cuda in [True, False]: for use_cuda in [True, False]:
xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j" xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j"
cp("../python-package/xgboost/tracker.py", f"{xgboost4j}/src/main/resources") cp(
"../python-package/xgboost/tracker.py",
f"{xgboost4j}/src/main/resources",
)
print("====copying resources for testing====") print("====copying resources for testing====")
with cd("../demo/CLI/regression"): with cd("../demo/CLI/regression"):
@@ -115,7 +132,12 @@ def main():
            cp(file, f"{xgboost4j_spark}/src/test/resources")

        print("====Creating directories to hold native binaries====")
-        for os_ident, arch in [("linux", "x86_64"), ("windows", "x86_64"), ("macos", "x86_64")]:
+        for os_ident, arch in [
+            ("linux", "x86_64"),
+            ("windows", "x86_64"),
+            ("macos", "x86_64"),
+            ("macos", "aarch64"),
+        ]:
            output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
            maybe_makedirs(output_dir)
        for os_ident, arch in [("linux", "x86_64")]:
@@ -123,52 +145,98 @@ def main():
            maybe_makedirs(output_dir)

        print("====Downloading native binaries from CI====")
-        nightly_bucket_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
-        maven_repo_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
+        nightly_bucket_prefix = (
+            "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
+        )
+        maven_repo_prefix = (
+            "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
+        )
-        retrieve(url=f"{nightly_bucket_prefix}/{git_branch}/xgboost4j_{commit_hash}.dll",
-                 filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll")
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/xgboost4j_{commit_hash}.dll",
+            filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll",
+        )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
+        )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_m1_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/aarch64/libxgboost4j.dylib",
+        )

        with tempfile.TemporaryDirectory() as tempdir:
            # libxgboost4j.so for Linux x86_64, CPU only
            zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar")
            extract_dir = os.path.join(tempdir, "xgboost4j")
-            retrieve(url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
-                         f"xgboost4j_2.12-{version}.jar",
-                     filename=zip_path)
+            retrieve(
+                url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
+                f"xgboost4j_2.12-{version}.jar",
+                filename=zip_path,
+            )
            os.mkdir(extract_dir)
            with zipfile.ZipFile(zip_path, "r") as t:
                t.extractall(extract_dir)
-            cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
-               "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
+            cp(
+                os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
+                "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
+            )

            # libxgboost4j.so for Linux x86_64, GPU support
            zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar")
            extract_dir = os.path.join(tempdir, "xgboost4j-gpu")
-            retrieve(url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
-                         f"xgboost4j-gpu_2.12-{version}.jar",
-                     filename=zip_path)
+            retrieve(
+                url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
+                f"xgboost4j-gpu_2.12-{version}.jar",
+                filename=zip_path,
+            )
            os.mkdir(extract_dir)
            with zipfile.ZipFile(zip_path, "r") as t:
                t.extractall(extract_dir)
-            cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
-               "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
+            cp(
+                os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
+                "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
+            )
print("====Next Steps====") print("====Next Steps====")
print("1. Gain upload right to Maven Central repo.") print("1. Gain upload right to Maven Central repo.")
print("1-1. Sign up for a JIRA account at Sonatype: ") print("1-1. Sign up for a JIRA account at Sonatype: ")
print("1-2. File a JIRA ticket: " print(
"https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: " "1-2. File a JIRA ticket: "
"https://issues.sonatype.org/browse/OSSRH-67724") "https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
print("2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in " "https://issues.sonatype.org/browse/OSSRH-67724"
"https://central.sonatype.org/publish/publish-maven/") )
print("3. Now on a Mac machine, run:") print(
print(" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests") "2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
print("4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging " "https://central.sonatype.org/publish/publish-maven/"
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 " )
"to inspect the staged JAR files. Finally, press Release button to publish the " print(
"artifacts to the Maven Central repository.") "3. Now on a Linux machine, run the following to build Scala 2.12 artifacts. "
"Make sure to use an Internet connection with fast upload speed:"
)
print(
" # Skip native build, since we have all needed native binaries from CI\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
)
print(
"4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
"to inspect the staged JAR files. Finally, press Release button to publish the "
"artifacts to the Maven Central repository. The top-level metapackage should be "
"named xgboost-jvm_2.12."
)
print(
"5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
" GPG_TTY=$(tty) mvn deploy -Prelease-cpu-only,scala-2.13 -DskipTests"
)
print(
"6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts. "
"The top-level metapackage should be named xgboost-jvm_2.13."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -329,7 +329,7 @@ Parameters for Linear Booster (``booster=gblinear``)
- Choice of algorithm to fit linear model
- ``shotgun``: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run.
-- ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution.
+- ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the ``device`` parameter is set to ``cuda`` or ``gpu``, a GPU variant would be used.
* ``feature_selector`` [default= ``cyclic``]

View File

@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 by Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
 * \file global_config.h
 * \brief Global configuration for XGBoost
 * \author Hyunsu Cho
@@ -7,24 +7,22 @@
#ifndef XGBOOST_GLOBAL_CONFIG_H_
#define XGBOOST_GLOBAL_CONFIG_H_
-#include <xgboost/parameter.h>
-#include <vector>
-#include <string>
+#include <dmlc/thread_local.h>  // for ThreadLocalStore
+#include <xgboost/parameter.h>  // for XGBoostParameter
+#include <cstdint>  // for int32_t
namespace xgboost {
class Json;
struct GlobalConfiguration : public XGBoostParameter<GlobalConfiguration> {
-  int verbosity { 1 };
-  bool use_rmm { false };
+  std::int32_t verbosity{1};
+  bool use_rmm{false};
  DMLC_DECLARE_PARAMETER(GlobalConfiguration) {
    DMLC_DECLARE_FIELD(verbosity)
        .set_range(0, 3)
        .set_default(1)  // shows only warning
        .describe("Flag to print out detailed breakdown of runtime.");
-    DMLC_DECLARE_FIELD(use_rmm)
-        .set_default(false)
-        .describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
+    DMLC_DECLARE_FIELD(use_rmm).set_default(false).describe(
+        "Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
  }
};

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */
-#define XGBOOST_VER_PATCH 0 /* NOLINT */
+#define XGBOOST_VER_PATCH 3 /* NOLINT */
#endif  // XGBOOST_VERSION_CONFIG_H_

View File

@@ -25,4 +25,3 @@ target_include_directories(xgboost4j
${PROJECT_SOURCE_DIR}/rabit/include)
set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
-target_link_libraries(xgboost4j PRIVATE ${JAVA_JVM_LIBRARY})

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
-import errno
import argparse
+import errno
import glob
import os
import platform
@@ -19,11 +19,10 @@ CONFIG = {
"USE_HDFS": "OFF",
"USE_AZURE": "OFF",
"USE_S3": "OFF",
"USE_CUDA": "OFF",
"USE_NCCL": "OFF",
"JVM_BINDINGS": "ON",
-"LOG_CAPI_INVOCATION": "OFF"
+"LOG_CAPI_INVOCATION": "OFF",
}
@@ -70,26 +69,22 @@ def normpath(path):
        return normalized

-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
-    parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
-    cli_args = parser.parse_args()
+def native_build(args):
    if sys.platform == "darwin":
        # Enable of your compiler supports OpenMP.
        CONFIG["USE_OPENMP"] = "OFF"
-        os.environ["JAVA_HOME"] = subprocess.check_output(
-            "/usr/libexec/java_home").strip().decode()
+        os.environ["JAVA_HOME"] = (
+            subprocess.check_output("/usr/libexec/java_home").strip().decode()
+        )

    print("building Java wrapper")
    with cd(".."):
-        build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build'
+        build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
        maybe_makedirs(build_dir)
        with cd(build_dir):
            if sys.platform == "win32":
                # Force x64 build on Windows.
-                maybe_generator = ' -A x64'
+                maybe_generator = " -A x64"
            else:
                maybe_generator = ""
            if sys.platform == "linux":
@@ -97,12 +92,12 @@ if __name__ == "__main__":
            else:
                maybe_parallel_build = ""

-            if cli_args.log_capi_invocation == 'ON':
-                CONFIG['LOG_CAPI_INVOCATION'] = 'ON'
+            if cli_args.log_capi_invocation == "ON":
+                CONFIG["LOG_CAPI_INVOCATION"] = "ON"

-            if cli_args.use_cuda == 'ON':
-                CONFIG['USE_CUDA'] = 'ON'
-                CONFIG['USE_NCCL'] = 'ON'
+            if cli_args.use_cuda == "ON":
+                CONFIG["USE_CUDA"] = "ON"
+                CONFIG["USE_NCCL"] = "ON"

            args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
@@ -115,7 +110,7 @@ if __name__ == "__main__":
            if gpu_arch_flag is not None:
                args.append("%s" % gpu_arch_flag)

-            lib_dir = os.path.join(os.pardir, 'lib')
+            lib_dir = os.path.join(os.pardir, "lib")
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
            run("cmake .. " + " ".join(args) + maybe_generator)
@@ -125,8 +120,10 @@ if __name__ == "__main__":
        run(f'"{sys.executable}" mapfeat.py')
        run(f'"{sys.executable}" mknfold.py machine.txt 1')

-    xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j'
-    xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark'
+    xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
+    xgboost4j_spark = (
+        "xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
+    )

    print("copying native library")
    library_name, os_folder = {
@@ -141,14 +138,19 @@ if __name__ == "__main__":
        "i86pc": "x86_64",  # on Solaris x86_64
        "sun4v": "sparc",  # on Solaris sparc
        "arm64": "aarch64",  # on macOS & Windows ARM 64-bit
-        "aarch64": "aarch64"
+        "aarch64": "aarch64",
    }[platform.machine().lower()]
-    output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder)
+    output_folder = "{}/src/main/resources/lib/{}/{}".format(
+        xgboost4j, os_folder, arch_folder
+    )
    maybe_makedirs(output_folder)
    cp("../lib/" + library_name, output_folder)

    print("copying pure-Python tracker")
-    cp("../python-package/xgboost/tracker.py", "{}/src/main/resources".format(xgboost4j))
+    cp(
+        "../python-package/xgboost/tracker.py",
+        "{}/src/main/resources".format(xgboost4j),
+    )

    print("copying train/test files")
    maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
@@ -164,3 +166,18 @@ if __name__ == "__main__":
    maybe_makedirs("{}/src/test/resources".format(xgboost4j))
    for file in glob.glob("../demo/data/agaricus.*"):
        cp(file, "{}/src/test/resources".format(xgboost4j))
if __name__ == "__main__":
if "MAVEN_SKIP_NATIVE_BUILD" in os.environ:
print("MAVEN_SKIP_NATIVE_BUILD is set. Skipping native build...")
else:
parser = argparse.ArgumentParser()
parser.add_argument(
"--log-capi-invocation", type=str, choices=["ON", "OFF"], default="OFF"
)
parser.add_argument(
"--use-cuda", type=str, choices=["ON", "OFF"], default="OFF"
)
cli_args = parser.parse_args()
native_build(cli_args)

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>
@@ -189,6 +189,93 @@
</plugins>
</build>
</profile>
<profile>
<id>release-cpu-only</id>
<modules>
<module>xgboost4j</module>
<module>xgboost4j-example</module>
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>empty-javadoc-jar</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>javadoc</classifier>
<classesDirectory>${basedir}/javadoc</classesDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile> <profile>
<id>assembly</id> <id>assembly</id>
<build> <build>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-example</name>
-<artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost4j-example_2.12</artifactId>
+<version>2.0.3</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-spark_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -37,7 +37,7 @@
</dependency>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-flink_2.12</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>

View File

@@ -5,13 +5,13 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-flink</name>
-<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost4j-flink_2.12</artifactId>
+<version>2.0.3</version>
<properties>
<flink-ml.version>2.2.0</flink-ml.version>
</properties>
@@ -30,7 +30,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
-<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-gpu_2.12</artifactId>
<name>xgboost4j-gpu</name>
-<version>2.0.0-SNAPSHOT</version>
+<version>2.0.3</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-spark-gpu</name>
-<artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
<plugins>
<plugin>
@@ -24,7 +24,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-spark</name>
-<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
<plugins>
<plugin>
@@ -24,7 +24,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j</name>
-<artifactId>xgboost4j_${scala.binary.version}</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost4j_2.12</artifactId>
+<version>2.0.3</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -132,16 +132,28 @@ def locate_or_build_libxgboost(
    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
-        libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
-        if not libxgboost_sys.exists():
-            raise RuntimeError(
-                f"use_system_libxgboost was specified but {_lib_name()} is "
-                f"not found in {libxgboost_sys.parent}"
-            )
-        logger.info("Using system XGBoost: %s", str(libxgboost_sys))
-        return libxgboost_sys
+        sys_prefix = pathlib.Path(sys.base_prefix)
+        sys_prefix_candidates = [
+            sys_prefix / "lib",
+            # Paths possibly used on Windows
+            sys_prefix / "bin",
+            sys_prefix / "Library",
+            sys_prefix / "Library" / "bin",
+            sys_prefix / "Library" / "lib",
+        ]
+        sys_prefix_candidates = [
+            p.expanduser().resolve() for p in sys_prefix_candidates
+        ]
+        for candidate_dir in sys_prefix_candidates:
+            libtreelite_sys = candidate_dir / _lib_name()
+            if libtreelite_sys.exists():
+                logger.info("Using system XGBoost: %s", str(libtreelite_sys))
+                return libtreelite_sys
+        raise RuntimeError(
+            f"use_system_libxgboost was specified but {_lib_name()} is "
+            f"not found. Paths searched (in order): \n"
+            + "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
+        )

    libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
    if libxgboost is not None:

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project]
name = "xgboost"
-version = "2.0.0-dev"
+version = "2.0.3"
authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
-2.0.0-dev
+2.0.3

View File

@@ -88,6 +88,18 @@ def is_cudf_available() -> bool:
        return False
def is_cupy_available() -> bool:
"""Check cupy package available or not"""
if importlib.util.find_spec("cupy") is None:
return False
try:
import cupy
return True
except ImportError:
return False
try:
    import scipy.sparse as scipy_sparse
    from scipy.sparse import csr_matrix as scipy_csr
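The new is_cupy_available helper mirrors is_cudf_available just above it: it checks for the module spec and then confirms that cupy actually imports. A short usage sketch; the xgboost.compat import path is an assumption here, since the diff does not name the file:

# Assumed location: the helper is added next to is_cudf_available (module path not shown in the diff).
from xgboost.compat import is_cupy_available

import numpy as np

def make_batch(n_rows: int, n_cols: int):
    # Prefer a GPU array when cupy is usable, otherwise fall back to NumPy.
    if is_cupy_available():
        import cupy as cp
        return cp.random.rand(n_rows, n_cols)
    return np.random.rand(n_rows, n_cols)

X = make_batch(100, 4)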

View File

@@ -206,6 +206,7 @@ def _load_lib() -> ctypes.CDLL:
            lib = ctypes.cdll.LoadLibrary(lib_path)
            setattr(lib, "path", os.path.normpath(lib_path))
            lib_success = True
+            break
        except OSError as e:
            os_error_list.append(str(e))
            continue
@@ -2399,6 +2400,7 @@ class Booster:
            _is_cudf_df,
            _is_cupy_array,
            _is_list,
+            _is_np_array_like,
            _is_pandas_df,
            _is_pandas_series,
            _is_tuple,
@@ -2428,7 +2430,7 @@ class Booster:
                f"got {data.shape[1]}"
            )
-        if isinstance(data, np.ndarray):
+        if _is_np_array_like(data):
            from .data import _ensure_np_dtype
            data, _ = _ensure_np_dtype(data, data.dtype)
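The isinstance(data, np.ndarray) check above is replaced by _is_np_array_like, which, as defined in the data-dispatch changes further down, accepts anything exposing the NumPy array interface rather than only np.ndarray. A small standalone sketch of the same duck-typing check:

import numpy as np

def is_np_array_like(data):
    # True for np.ndarray, np.matrix, and any other object exporting __array_interface__
    return hasattr(data, "__array_interface__")

assert is_np_array_like(np.zeros((2, 3)))
assert is_np_array_like(np.matrix([[1.0, 2.0]]))
assert not is_np_array_like([[1.0, 2.0]])  # plain Python lists are dispatched differently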

View File

@@ -78,7 +78,6 @@ from .data import _is_cudf_ser, _is_cupy_array
from .sklearn import (
    XGBClassifier,
    XGBClassifierBase,
-    XGBClassifierMixIn,
    XGBModel,
    XGBRanker,
    XGBRankerMixIn,
@@ -1854,7 +1853,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
    "Implementation of the scikit-learn API for XGBoost classification.",
    ["estimators", "model"],
)
-class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
+class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
    # pylint: disable=missing-class-docstring
    async def _fit_async(
        self,
@@ -2036,10 +2035,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
        preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
        return preds

-    def load_model(self, fname: ModelIn) -> None:
-        super().load_model(fname)
-        self._load_model_attributes(self.get_booster())

@xgboost_model_doc(
    """Implementation of the Scikit-Learn API for XGBoost Ranking.

View File

@@ -164,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
    return isinstance(data, scipy.sparse.coo_matrix)

-def _is_numpy_array(data: DataType) -> bool:
-    return isinstance(data, (np.ndarray, np.matrix))
+def _is_np_array_like(data: DataType) -> bool:
+    return hasattr(data, "__array_interface__")

def _ensure_np_dtype(
@@ -317,7 +317,6 @@ def pandas_feature_info(
) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
    """Handle feature info for pandas dataframe."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse

    # handle feature names
    if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
            ) and enable_categorical:
                feature_types.append(CAT_T)
            else:
@@ -345,18 +344,13 @@ def pandas_feature_info(
def is_nullable_dtype(dtype: PandasDType) -> bool:
    """Whether dtype is a pandas nullable type."""
-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

    is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)

def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
    )
def is_pd_cat_dtype(dtype: PandasDType) -> bool:
"""Wrapper for testing pandas category type."""
import pandas as pd
if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
Version = pd.util.version.Version
if Version(pd.__version__) >= Version("2.1.0"):
from pandas import CategoricalDtype
return isinstance(dtype, CategoricalDtype)
from pandas.api.types import is_categorical_dtype
return is_categorical_dtype(dtype)
def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
"""Wrapper for testing pandas sparse type."""
import pandas as pd
if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
Version = pd.util.version.Version
if Version(pd.__version__) >= Version("2.1.0"):
from pandas import SparseDtype
return isinstance(dtype, SparseDtype)
from pandas.api.types import is_sparse
return is_sparse(dtype)
def pandas_cat_null(data: DataFrame) -> DataFrame:
    """Handle categorical dtype and nullable extension types from pandas."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype

    # handle category codes and nullable.
    cat_columns = []
    nul_columns = []
    # avoid an unnecessary conversion if possible
    for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
            cat_columns.append(col)
        elif is_pa_ext_categorical_dtype(dtype):
            raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
        transformed = data

    def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
            return ser.cat.codes
        assert is_pa_ext_categorical_dtype(ser.dtype)
        # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
    meta: Optional[str] = None,
    meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse

    pyarrow_extension = False
    for dtype in data.dtypes:
        if not (
            (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
            or is_pa_ext_dtype(dtype)
        ):
            _invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
) -> None:
    """Help transform pandas series for meta data like labels"""
    data = data.values.astype("float")
-    from pandas.api.types import is_sparse

-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
        data = data.to_dense()  # type: ignore
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    _meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype

    if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
    ):
        _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
        data = data.cat.codes
    return _from_numpy_array(
        data.values.reshape(data.shape[0], 1).astype("float"),
@@ -1051,7 +1071,7 @@ def dispatch_data_backend(
        return _from_scipy_csr(
            data.tocsr(), missing, threads, feature_names, feature_types
        )
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        return _from_numpy_array(
            data, missing, threads, feature_names, feature_types, data_split_mode
        )
@@ -1194,7 +1214,7 @@ def dispatch_meta_backend(
    if _is_tuple(data):
        _meta_from_tuple(data, name, dtype, handle)
        return
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        _meta_from_numpy(data, name, dtype, handle)
        return
    if _is_pandas_df(data):
@@ -1281,7 +1301,7 @@ def _proxy_transform(
        return _transform_dlpack(data), None, feature_names, feature_types
    if _is_list(data) or _is_tuple(data):
data = np.array(data) data = np.array(data)
if _is_numpy_array(data): if _is_np_array_like(data):
data, _ = _ensure_np_dtype(data, data.dtype) data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types return data, None, feature_names, feature_types
if _is_scipy_csr(data): if _is_scipy_csr(data):
@ -1331,7 +1351,7 @@ def dispatch_proxy_set_data(
if not allow_host: if not allow_host:
raise err raise err
if _is_numpy_array(data): if _is_np_array_like(data):
_check_data_shape(data) _check_data_shape(data)
proxy._set_data_from_array(data) # pylint: disable=W0212 proxy._set_data_from_array(data) # pylint: disable=W0212
return return


@ -31,16 +31,15 @@ def find_lib_path() -> List[str]:
] ]
if sys.platform == "win32": if sys.platform == "win32":
if platform.architecture()[0] == "64bit": # On Windows, Conda may install libs in different paths
dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/")) dll_path.extend(
# hack for pip installation when copy all parent source [
# directory here os.path.join(sys.base_prefix, "bin"),
dll_path.append(os.path.join(curr_path, "./windows/x64/Release/")) os.path.join(sys.base_prefix, "Library"),
else: os.path.join(sys.base_prefix, "Library", "bin"),
dll_path.append(os.path.join(curr_path, "../../windows/Release/")) os.path.join(sys.base_prefix, "Library", "lib"),
# hack for pip installation when copy all parent source ]
# directory here )
dll_path.append(os.path.join(curr_path, "./windows/Release/"))
dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path] dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
elif sys.platform.startswith(("linux", "freebsd", "emscripten")): elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path] dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]


@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .training import train from .training import train
class XGBClassifierMixIn: # pylint: disable=too-few-public-methods
"""MixIn for classification."""
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
def _load_model_attributes(self, booster: Booster) -> None:
config = json.loads(booster.save_config())
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
class XGBRankerMixIn: # pylint: disable=too-few-public-methods class XGBRankerMixIn: # pylint: disable=too-few-public-methods
"""MixIn for ranking, defines the _estimator_type usually defined in scikit-learn """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
base classes. base classes.
@ -845,21 +832,38 @@ class XGBModel(XGBModelBase):
self.get_booster().load_model(fname) self.get_booster().load_model(fname)
meta_str = self.get_booster().attr("scikit_learn") meta_str = self.get_booster().attr("scikit_learn")
if meta_str is None: if meta_str is not None:
return meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
self.feature_types = self.get_booster().feature_types self.feature_types = self.get_booster().feature_types
self.get_booster().set_attr(scikit_learn=None) self.get_booster().set_attr(scikit_learn=None)
config = json.loads(self.get_booster().save_config())
self._load_model_attributes(config)
load_model.__doc__ = f"""{Booster.load_model.__doc__}""" load_model.__doc__ = f"""{Booster.load_model.__doc__}"""
def _load_model_attributes(self, config: dict) -> None:
"""Load model attributes without hyper-parameters."""
from sklearn.base import is_classifier
booster = self.get_booster()
self.objective = config["learner"]["objective"]["name"]
self.booster = config["learner"]["gradient_booster"]["name"]
self.base_score = config["learner"]["learner_model_param"]["base_score"]
self.feature_types = booster.feature_types
if is_classifier(self):
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
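
With this backport, scikit-learn attributes are reconstructed from the booster's saved config on every load_model call instead of relying on the removed classifier mixin, so objective, booster, base_score, feature_types and, for classifiers, n_classes_ survive a save/load round trip. A hedged usage sketch (the file name and random data are illustrative):

import numpy as np
from xgboost import XGBClassifier

X = np.random.rand(64, 4)
y = np.random.randint(0, 3, size=64)

clf = XGBClassifier(n_estimators=5)
clf.fit(X, y)
clf.save_model("clf.json")

restored = XGBClassifier()
restored.load_model("clf.json")
print(restored.n_classes_)  # 3, taken from learner_model_param.num_class
print(restored.objective)   # e.g. "multi:softprob"
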
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
def _configure_fit( def _configure_fit(
self, self,
@ -1409,7 +1413,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
Number of boosting rounds. Number of boosting rounds.
""", """,
) )
class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
@_deprecate_positional_args @_deprecate_positional_args
def __init__( def __init__(
@ -1637,10 +1641,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def classes_(self) -> np.ndarray: def classes_(self) -> np.ndarray:
return np.arange(self.n_classes_) return np.arange(self.n_classes_)
def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())
@xgboost_model_doc( @xgboost_model_doc(
"scikit-learn API for XGBoost random forest classification.", "scikit-learn API for XGBoost random forest classification.",
@ -2093,7 +2093,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
""" """
X, qid = _get_qid(X, None) X, qid = _get_qid(X, None)
Xyq = DMatrix(X, y, qid=qid) # fixme(jiamingy): base margin and group weight is not yet supported. We might
# need to make extra special fields in the dataframe.
Xyq = DMatrix(
X,
y,
qid=qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
nthread=self.n_jobs,
feature_types=self.feature_types,
)
if callable(self.eval_metric): if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs) metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric) result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)


@ -22,7 +22,7 @@ from typing import (
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pyspark import SparkContext, cloudpickle from pyspark import RDD, SparkContext, cloudpickle
from pyspark.ml import Estimator, Model from pyspark.ml import Estimator, Model
from pyspark.ml.functions import array_to_vector, vector_to_array from pyspark.ml.functions import array_to_vector, vector_to_array
from pyspark.ml.linalg import VectorUDT from pyspark.ml.linalg import VectorUDT
@ -44,6 +44,7 @@ from pyspark.ml.util import (
MLWritable, MLWritable,
MLWriter, MLWriter,
) )
from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
from pyspark.sql import Column, DataFrame from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
from pyspark.sql.types import ( from pyspark.sql.types import (
@ -59,11 +60,12 @@ from scipy.special import expit, softmax # pylint: disable=no-name-in-module
import xgboost import xgboost
from xgboost import XGBClassifier from xgboost import XGBClassifier
from xgboost.compat import is_cudf_available from xgboost.compat import is_cudf_available, is_cupy_available
from xgboost.core import Booster, _check_distributed_params from xgboost.core import Booster, _check_distributed_params
from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
from xgboost.training import train as worker_train from xgboost.training import train as worker_train
from .._typing import ArrayLike
from .data import ( from .data import (
_read_csr_matrix_from_unwrapped_spark_vec, _read_csr_matrix_from_unwrapped_spark_vec,
alias, alias,
@ -87,6 +89,7 @@ from .utils import (
_get_rabit_args, _get_rabit_args,
_get_spark_session, _get_spark_session,
_is_local, _is_local,
_is_standalone_or_localcluster,
deserialize_booster, deserialize_booster,
deserialize_xgb_model, deserialize_xgb_model,
get_class_name, get_class_name,
@ -241,6 +244,13 @@ class _SparkXGBParams(
TypeConverters.toList, TypeConverters.toList,
) )
def set_device(self, value: str) -> "_SparkXGBParams":
"""Set device, optional value: cpu, cuda, gpu"""
_check_distributed_params({"device": value})
assert value in ("cpu", "cuda", "gpu")
self.set(self.device, value)
return self
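
A hedged usage sketch of the new setter on the PySpark estimators (SparkXGBClassifier and the column names below are just for illustration):

from xgboost.spark import SparkXGBClassifier

clf = SparkXGBClassifier(features_col="features", label_col="label", num_workers=2)
clf.set_device("cuda")  # validated against ("cpu", "cuda", "gpu")
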
@classmethod @classmethod
def _xgb_cls(cls) -> Type[XGBModel]: def _xgb_cls(cls) -> Type[XGBModel]:
""" """
@ -334,6 +344,54 @@ class _SparkXGBParams(
predict_params[param.name] = self.getOrDefault(param) predict_params[param.name] = self.getOrDefault(param)
return predict_params return predict_params
def _validate_gpu_params(self) -> None:
"""Validate the gpu parameters and gpu configurations"""
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
ss = _get_spark_session()
sc = ss.sparkContext
if _is_local(sc):
# Supporting GPU training in Spark local mode is only for debugging
# purposes, so it's okay to print the warning below instead of
# checking the real GPU count and raising an exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
if executor_gpus is None:
raise ValueError(
"The `spark.executor.resource.gpu.amount` is required for training"
" on GPU."
)
if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
# We will enable stage-level scheduling in spark 3.4.0+ which doesn't
# require spark.task.resource.gpu.amount to be set explicitly
gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
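
In non-local mode the check above requires spark.executor.resource.gpu.amount, and, when stage-level scheduling is unavailable (Spark older than 3.4 or a cluster manager other than standalone/local-cluster), a non-fractional spark.task.resource.gpu.amount as well. A hedged sketch of session settings that satisfy it (values are illustrative):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.executor.cores", "8")
    .config("spark.executor.resource.gpu.amount", "1")
    # Only needed when stage-level scheduling cannot be used; must not be fractional.
    .config("spark.task.resource.gpu.amount", "1")
    .getOrCreate()
)
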
def _validate_params(self) -> None: def _validate_params(self) -> None:
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
init_model = self.getOrDefault("xgb_model") init_model = self.getOrDefault("xgb_model")
@ -413,53 +471,7 @@ class _SparkXGBParams(
"`pyspark.ml.linalg.Vector` type." "`pyspark.ml.linalg.Vector` type."
) )
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): self._validate_gpu_params()
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
.get("spark.task.resource.gpu.amount")
)
is_local = _is_local(_get_spark_session().sparkContext)
if is_local:
# checking spark local mode.
if gpu_per_task is not None:
raise RuntimeError(
"The spark local mode does not support gpu configuration."
"Please remove spark.executor.resource.gpu.amount and "
"spark.task.resource.gpu.amount"
)
# Support GPU training in Spark local mode is just for debugging
# purposes, so it's okay for printing the below warning instead of
# checking the real gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
# checking spark non-local mode.
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
def _validate_and_convert_feature_col_as_float_col_list( def _validate_and_convert_feature_col_as_float_col_list(
@ -584,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
arbitrary_params_dict={}, arbitrary_params_dict={},
) )
self.logger = get_logger(self.__class__.__name__)
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
""" """
Set params for the estimator. Set params for the estimator.
@ -886,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
return booster_params, train_call_kwargs_params, dmatrix_kwargs return booster_params, train_call_kwargs_params, dmatrix_kwargs
def _skip_stage_level_scheduling(self) -> bool:
# pylint: disable=too-many-return-statements
"""Check if stage-level scheduling is not needed,
return true to skip stage-level scheduling"""
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
ss = _get_spark_session()
sc = ss.sparkContext
if ss.version < "3.4.0":
self.logger.info(
"Stage-level scheduling in xgboost requires spark version 3.4.0+"
)
return True
if not _is_standalone_or_localcluster(sc):
self.logger.info(
"Stage-level scheduling in xgboost requires spark standalone or "
"local-cluster mode"
)
return True
executor_cores = sc.getConf().get("spark.executor.cores")
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
if executor_cores is None or executor_gpus is None:
self.logger.info(
"Stage-level scheduling in xgboost requires spark.executor.cores, "
"spark.executor.resource.gpu.amount to be set."
)
return True
if int(executor_cores) == 1:
# there will be only 1 task running at any time.
self.logger.info(
"Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
)
return True
if int(executor_gpus) > 1:
# For spark.executor.resource.gpu.amount > 1, we assume the user knows how to
# configure it so that xgboost runs successfully.
#
self.logger.info(
"Stage-level scheduling in xgboost will not work "
"when spark.executor.resource.gpu.amount>1"
)
return True
task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
if task_gpu_amount is None:
# The ETL tasks will not grab a GPU when spark.task.resource.gpu.amount is not set,
# but with stage-level scheduling we can make the training tasks grab the GPU.
return False
if float(task_gpu_amount) == float(executor_gpus):
# spark.executor.resource.gpu.amount == spark.task.resource.gpu.amount
# results in only one task running at a time, which may cause a perf issue.
return True
# We can enable stage-level scheduling
return False
# CPU training doesn't require stage-level scheduling
return True
def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
"""Try to enable stage-level scheduling"""
if self._skip_stage_level_scheduling():
return rdd
ss = _get_spark_session()
# executor_cores will not be None
executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
assert executor_cores is not None
# Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
# If spark-rapids is enabled, to avoid GPU OOM, we don't allow other
# ETL gpu tasks running alongside training tasks.
spark_plugins = ss.conf.get("spark.plugins", " ")
assert spark_plugins is not None
spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true")
assert spark_rapids_sql_enabled is not None
task_cores = (
int(executor_cores)
if "com.nvidia.spark.SQLPlugin" in spark_plugins
and "true" == spark_rapids_sql_enabled.lower()
else (int(executor_cores) // 2) + 1
)
# Each training task requires executor_cores // 2 + 1 CPU cores, i.e. more than half
# of the executor's cores, which ensures the training tasks land on different executors.
#
# Please note that we can't use GPU to limit the concurrent tasks because of
# https://issues.apache.org/jira/browse/SPARK-45527.
task_gpus = 1.0
treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
rp = ResourceProfileBuilder().require(treqs).build
self.logger.info(
"XGBoost training tasks require the resource(cores=%s, gpu=%s).",
task_cores,
task_gpus,
)
return rdd.withResources(rp)
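
A worked example of the core-count arithmetic above (numbers are illustrative): with spark.executor.cores=8 and the RAPIDS SQL plugin disabled, each training task asks for 8 // 2 + 1 = 5 cores, so at most one training task fits per 8-core executor; with the plugin enabled, the task asks for all 8 cores so no ETL GPU task can run alongside it.

executor_cores = 8
rapids_sql_enabled = False
task_cores = executor_cores if rapids_sql_enabled else executor_cores // 2 + 1
print(task_cores)  # 5 -> only one 5-core training task fits in an 8-core executor
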
def _fit(self, dataset: DataFrame) -> "_SparkXGBModel": def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
# pylint: disable=too-many-statements, too-many-locals # pylint: disable=too-many-statements, too-many-locals
self._validate_params() self._validate_params()
@ -986,14 +1110,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
) )
def _run_job() -> Tuple[str, str]: def _run_job() -> Tuple[str, str]:
ret = ( rdd = (
dataset.mapInPandas( dataset.mapInPandas(
_train_booster, schema="config string, booster string" # type: ignore _train_booster, # type: ignore
schema="config string, booster string",
) )
.rdd.barrier() .rdd.barrier()
.mapPartitions(lambda x: x) .mapPartitions(lambda x: x)
.collect()[0]
) )
rdd_with_resource = self._try_stage_level_scheduling(rdd)
ret = rdd_with_resource.collect()[0]
return ret[0], ret[1] return ret[0], ret[1]
get_logger("XGBoost-PySpark").info( get_logger("XGBoost-PySpark").info(
@ -1117,12 +1243,111 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
) )
return features_col, feature_col_names return features_col, feature_col_names
def _get_pred_contrib_col_name(self) -> Optional[str]:
"""Return the pred_contrib_col col name"""
pred_contrib_col_name = None
if (
self.isDefined(self.pred_contrib_col)
and self.getOrDefault(self.pred_contrib_col) != ""
):
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
return pred_contrib_col_name
def _out_schema(self) -> Tuple[bool, str]:
"""Return the bool to indicate if it's a single prediction, true is single prediction,
and the returned type of the user-defined function. The value must
be a DDL-formatted type string."""
if self._get_pred_contrib_col_name() is not None:
return False, f"{pred.prediction} double, {pred.pred_contrib} array<double>"
return True, "double"
def _get_predict_func(self) -> Callable:
"""Return the true prediction function which will be running on the executor side"""
predict_params = self._gen_predict_params_dict()
pred_contrib_col_name = self._get_pred_contrib_col_name()
def _predict(
model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike]
) -> Union[pd.DataFrame, pd.Series]:
data = {}
preds = model.predict(
X,
base_margin=base_margin,
validate_features=False,
**predict_params,
)
data[pred.prediction] = pd.Series(preds)
if pred_contrib_col_name is not None:
contribs = pred_contribs(model, X, base_margin)
data[pred.pred_contrib] = pd.Series(list(contribs))
return pd.DataFrame(data=data)
return data[pred.prediction]
return _predict
def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
"""Post process of transform"""
prediction_col_name = self.getOrDefault(self.predictionCol)
single_pred, _ = self._out_schema()
if single_pred:
if prediction_col_name:
dataset = dataset.withColumn(prediction_col_name, pred_col)
else:
pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_col)
if prediction_col_name:
dataset = dataset.withColumn(
prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
)
pred_contrib_col_name = self._get_pred_contrib_col_name()
if pred_contrib_col_name is not None:
dataset = dataset.withColumn(
pred_contrib_col_name,
array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
)
dataset = dataset.drop(pred_struct_col)
return dataset
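
With the reworked transform, asking for SHAP-style contributions only changes the UDF output schema; _post_transform then unpacks the struct into the configured columns. A hedged usage sketch (the estimator arguments and data frames are illustrative and assumed to exist):

from xgboost.spark import SparkXGBRegressor

reg = SparkXGBRegressor(
    features_col="features", label_col="label", pred_contrib_col="contribs"
)
model = reg.fit(train_df)       # train_df assumed to exist
out = model.transform(test_df)  # adds "prediction" plus a vector column "contribs"
out.select("prediction", "contribs").show(3)
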
def _gpu_transform(self) -> bool:
"""If gpu is used to do the prediction, true to gpu prediction"""
if _is_local(_get_spark_session().sparkContext):
# in local mode, just follow the internal "device" setting
return use_cuda(self.getOrDefault(self.device))
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
.get("spark.task.resource.gpu.amount")
)
# The user didn't set any GPU configuration, so just use the CPU
if gpu_per_task is None:
if use_cuda(self.getOrDefault(self.device)):
get_logger("XGBoost-PySpark").warning(
"Do the prediction on the CPUs since "
"no gpu configurations are set"
)
return False
# User already sets the gpu configurations, we just use the internal "device".
return use_cuda(self.getOrDefault(self.device))
def _transform(self, dataset: DataFrame) -> DataFrame: def _transform(self, dataset: DataFrame) -> DataFrame:
# pylint: disable=too-many-statements, too-many-locals # pylint: disable=too-many-statements, too-many-locals
# Save xgb_sklearn_model and predict_params to be local variable # Save xgb_sklearn_model and predict_params to be local variable
# to avoid the `self` object to be pickled to remote. # to avoid the `self` object to be pickled to remote.
xgb_sklearn_model = self._xgb_sklearn_model xgb_sklearn_model = self._xgb_sklearn_model
predict_params = self._gen_predict_params_dict()
has_base_margin = False has_base_margin = False
if ( if (
@ -1137,79 +1362,92 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
features_col, feature_col_names = self._get_feature_col(dataset) features_col, feature_col_names = self._get_feature_col(dataset)
enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim) enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
pred_contrib_col_name = None predict_func = self._get_predict_func()
if (
self.isDefined(self.pred_contrib_col)
and self.getOrDefault(self.pred_contrib_col) != ""
):
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
single_pred = True _, schema = self._out_schema()
schema = "double"
if pred_contrib_col_name: is_local = _is_local(_get_spark_session().sparkContext)
single_pred = False run_on_gpu = self._gpu_transform()
schema = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
@pandas_udf(schema) # type: ignore @pandas_udf(schema) # type: ignore
def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]: def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
assert xgb_sklearn_model is not None assert xgb_sklearn_model is not None
model = xgb_sklearn_model model = xgb_sklearn_model
from pyspark import TaskContext
context = TaskContext.get()
assert context is not None
dev_ordinal = -1
if is_cudf_available():
if is_local:
if run_on_gpu and is_cupy_available():
import cupy as cp # pylint: disable=import-error
total_gpus = cp.cuda.runtime.getDeviceCount()
if total_gpus > 0:
partition_id = context.partitionId()
# For transform local mode, default the dev_ordinal to
# (partition id) % gpus.
dev_ordinal = partition_id % total_gpus
elif run_on_gpu:
dev_ordinal = _get_gpu_id(context)
if dev_ordinal >= 0:
device = "cuda:" + str(dev_ordinal)
get_logger("XGBoost-PySpark").info(
"Do the inference with device: %s", device
)
model.set_params(device=device)
else:
get_logger("XGBoost-PySpark").info("Do the inference on the CPUs")
else:
msg = (
"CUDF is unavailable, fallback the inference on the CPUs"
if run_on_gpu
else "Do the inference on the CPUs"
)
get_logger("XGBoost-PySpark").info(msg)
def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
"""Move the data to gpu if possible"""
if dev_ordinal >= 0:
import cudf # pylint: disable=import-error
import cupy as cp # pylint: disable=import-error
# We must set the device after import cudf, which will change the device id to 0
# See https://github.com/rapidsai/cudf/issues/11386
cp.cuda.runtime.setDevice(dev_ordinal) # pylint: disable=I1101
df = cudf.DataFrame(data)
del data
return df
return data
for data in iterator: for data in iterator:
if enable_sparse_data_optim: if enable_sparse_data_optim:
X = _read_csr_matrix_from_unwrapped_spark_vec(data) X = _read_csr_matrix_from_unwrapped_spark_vec(data)
else: else:
if feature_col_names is not None: if feature_col_names is not None:
X = data[feature_col_names] tmp = data[feature_col_names]
else: else:
X = stack_series(data[alias.data]) tmp = stack_series(data[alias.data])
X = to_gpu_if_possible(tmp)
if has_base_margin: if has_base_margin:
base_margin = data[alias.margin].to_numpy() base_margin = to_gpu_if_possible(data[alias.margin])
else: else:
base_margin = None base_margin = None
data = {} yield predict_func(model, X, base_margin)
preds = model.predict(
X,
base_margin=base_margin,
validate_features=False,
**predict_params,
)
data[pred.prediction] = pd.Series(preds)
if pred_contrib_col_name:
contribs = pred_contribs(model, X, base_margin)
data[pred.pred_contrib] = pd.Series(list(contribs))
yield pd.DataFrame(data=data)
else:
yield data[pred.prediction]
if has_base_margin: if has_base_margin:
pred_col = predict_udf(struct(*features_col, base_margin_col)) pred_col = predict_udf(struct(*features_col, base_margin_col))
else: else:
pred_col = predict_udf(struct(*features_col)) pred_col = predict_udf(struct(*features_col))
prediction_col_name = self.getOrDefault(self.predictionCol) return self._post_transform(dataset, pred_col)
if single_pred:
dataset = dataset.withColumn(prediction_col_name, pred_col)
else:
pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_col)
dataset = dataset.withColumn(
prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
)
if pred_contrib_col_name:
dataset = dataset.withColumn(
pred_contrib_col_name,
array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
)
dataset = dataset.drop(pred_struct_col)
return dataset
class _ClassificationModel( # pylint: disable=abstract-method class _ClassificationModel( # pylint: disable=abstract-method
@ -1221,22 +1459,21 @@ class _ClassificationModel( # pylint: disable=abstract-method
.. Note:: This API is experimental. .. Note:: This API is experimental.
""" """
def _transform(self, dataset: DataFrame) -> DataFrame: def _out_schema(self) -> Tuple[bool, str]:
# pylint: disable=too-many-statements, too-many-locals schema = (
# Save xgb_sklearn_model and predict_params to be local variable f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
# to avoid the `self` object to be pickled to remote. f" {pred.probability} array<double>"
xgb_sklearn_model = self._xgb_sklearn_model )
predict_params = self._gen_predict_params_dict() if self._get_pred_contrib_col_name() is not None:
# We will force setting strict_shape to True when predicting contribs,
# So, it will also output 3-D shape result.
schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
has_base_margin = False return False, schema
if (
self.isDefined(self.base_margin_col) def _get_predict_func(self) -> Callable:
and self.getOrDefault(self.base_margin_col) != "" predict_params = self._gen_predict_params_dict()
): pred_contrib_col_name = self._get_pred_contrib_col_name()
has_base_margin = True
base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias(
alias.margin
)
def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
if margins.ndim == 1: if margins.ndim == 1:
@ -1251,76 +1488,38 @@ class _ClassificationModel( # pylint: disable=abstract-method
class_probs = softmax(raw_preds, axis=1) class_probs = softmax(raw_preds, axis=1)
return raw_preds, class_probs return raw_preds, class_probs
features_col, feature_col_names = self._get_feature_col(dataset) def _predict(
enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim) model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray]
) -> Union[pd.DataFrame, pd.Series]:
margins = model.predict(
X,
base_margin=base_margin,
output_margin=True,
validate_features=False,
**predict_params,
)
raw_preds, class_probs = transform_margin(margins)
pred_contrib_col_name = None # It seems that they use argmax of class probs,
if ( # not of margin to get the prediction (Note: scala implementation)
self.isDefined(self.pred_contrib_col) preds = np.argmax(class_probs, axis=1)
and self.getOrDefault(self.pred_contrib_col) != "" result: Dict[str, pd.Series] = {
): pred.raw_prediction: pd.Series(list(raw_preds)),
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) pred.prediction: pd.Series(preds),
pred.probability: pd.Series(list(class_probs)),
}
schema = ( if pred_contrib_col_name is not None:
f"{pred.raw_prediction} array<double>, {pred.prediction} double," contribs = pred_contribs(model, X, base_margin, strict_shape=True)
f" {pred.probability} array<double>" result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
)
if pred_contrib_col_name:
# We will force setting strict_shape to True when predicting contribs,
# So, it will also output 3-D shape result.
schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
@pandas_udf(schema) # type: ignore return pd.DataFrame(data=result)
def predict_udf(
iterator: Iterator[Tuple[pd.Series, ...]]
) -> Iterator[pd.DataFrame]:
assert xgb_sklearn_model is not None
model = xgb_sklearn_model
for data in iterator:
if enable_sparse_data_optim:
X = _read_csr_matrix_from_unwrapped_spark_vec(data)
else:
if feature_col_names is not None:
X = data[feature_col_names] # type: ignore
else:
X = stack_series(data[alias.data])
if has_base_margin: return _predict
base_margin = stack_series(data[alias.margin])
else:
base_margin = None
margins = model.predict(
X,
base_margin=base_margin,
output_margin=True,
validate_features=False,
**predict_params,
)
raw_preds, class_probs = transform_margin(margins)
# It seems that they use argmax of class probs,
# not of margin to get the prediction (Note: scala implementation)
preds = np.argmax(class_probs, axis=1)
result: Dict[str, pd.Series] = {
pred.raw_prediction: pd.Series(list(raw_preds)),
pred.prediction: pd.Series(preds),
pred.probability: pd.Series(list(class_probs)),
}
if pred_contrib_col_name:
contribs = pred_contribs(model, X, base_margin, strict_shape=True)
result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
yield pd.DataFrame(data=result)
if has_base_margin:
pred_struct = predict_udf(struct(*features_col, base_margin_col))
else:
pred_struct = predict_udf(struct(*features_col))
def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
pred_struct_col = "_prediction_struct" pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_struct) dataset = dataset.withColumn(pred_struct_col, pred_col)
raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol) raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol)
if raw_prediction_col_name: if raw_prediction_col_name:
@ -1342,7 +1541,8 @@ class _ClassificationModel( # pylint: disable=abstract-method
array_to_vector(getattr(col(pred_struct_col), pred.probability)), array_to_vector(getattr(col(pred_struct_col), pred.probability)),
) )
if pred_contrib_col_name: pred_contrib_col_name = self._get_pred_contrib_col_name()
if pred_contrib_col_name is not None:
dataset = dataset.withColumn( dataset = dataset.withColumn(
pred_contrib_col_name, pred_contrib_col_name,
getattr(col(pred_struct_col), pred.pred_contrib), getattr(col(pred_struct_col), pred.pred_contrib),


@ -10,7 +10,7 @@ from threading import Thread
from typing import Any, Callable, Dict, Optional, Set, Type from typing import Any, Callable, Dict, Optional, Set, Type
import pyspark import pyspark
from pyspark import BarrierTaskContext, SparkContext, SparkFiles from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext
from pyspark.sql.session import SparkSession from pyspark.sql.session import SparkSession
from xgboost import Booster, XGBModel, collective from xgboost import Booster, XGBModel, collective
@ -129,7 +129,14 @@ def _is_local(spark_context: SparkContext) -> bool:
return spark_context._jsc.sc().isLocal() return spark_context._jsc.sc().isLocal()
def _get_gpu_id(task_context: BarrierTaskContext) -> int: def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
master = spark_context.getConf().get("spark.master")
return master is not None and (
master.startswith("spark://") or master.startswith("local-cluster")
)
def _get_gpu_id(task_context: TaskContext) -> int:
"""Get the gpu id from the task resources""" """Get the gpu id from the task resources"""
if task_context is None: if task_context is None:
# This is a safety check. # This is a safety check.


@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
with pytest.raises(ValueError, match="Either `group` or `qid`."): with pytest.raises(ValueError, match="Either `group` or `qid`."):
ranker.fit(df, y, eval_set=[(X, y)]) ranker.fit(df, y, eval_set=[(X, y)])
def run_ranking_categorical(device: str) -> None:
"""Test LTR with categorical features."""
from sklearn.model_selection import cross_val_score
X, y = tm.make_categorical(
n_samples=512, n_features=10, n_categories=3, onehot=False
)
rng = np.random.default_rng(1994)
qid = rng.choice(3, size=y.shape[0])
qid = np.sort(qid)
X["qid"] = qid
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
ltr.fit(X, y)
score = ltr.score(X, y)
assert score > 0.9
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
# test using the score function inside sklearn.
scores = cross_val_score(ltr, X, y)
for s in scores:
assert s > 0.7


@ -52,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
* *
* Go to left if it's NOT the matching category, which matches one-hot encoding. * Go to left if it's NOT the matching category, which matches one-hot encoding.
*/ */
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) { inline XGBOOST_DEVICE bool Decision(common::Span<CatBitField::value_type const> cats, float cat) {
KCatBitField const s_cats(cats); KCatBitField const s_cats(cats);
if (XGBOOST_EXPECT(InvalidCat(cat), false)) { if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
return true; return true;


@ -3,9 +3,11 @@
*/ */
#include "error_msg.h" #include "error_msg.h"
#include <mutex> // for call_once, once_flag
#include <sstream> // for stringstream #include <sstream> // for stringstream
#include "../collective/communicator-inl.h" // for GetRank #include "../collective/communicator-inl.h" // for GetRank
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h" #include "xgboost/logging.h"
namespace xgboost::error { namespace xgboost::error {
@ -26,34 +28,43 @@ void WarnDeprecatedGPUHist() {
} }
void WarnManualUpdater() { void WarnManualUpdater() {
bool static thread_local logged{false}; static std::once_flag flag;
if (logged) { std::call_once(flag, [] {
return; LOG(WARNING)
} << "You have manually specified the `updater` parameter. The `tree_method` parameter "
LOG(WARNING) "will be ignored. Incorrect sequence of updaters will produce undefined "
<< "You have manually specified the `updater` parameter. The `tree_method` parameter " "behavior. For common uses, we recommend using `tree_method` parameter instead.";
"will be ignored. Incorrect sequence of updaters will produce undefined " });
"behavior. For common uses, we recommend using `tree_method` parameter instead.";
logged = true;
} }
void WarnDeprecatedGPUId() { void WarnDeprecatedGPUId() {
static thread_local bool logged{false}; static std::once_flag flag;
if (logged) { std::call_once(flag, [] {
return; auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
} msg += " E.g. device=cpu/cuda/cuda:0";
auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device"); LOG(WARNING) << msg;
msg += " E.g. device=cpu/cuda/cuda:0"; });
LOG(WARNING) << msg;
logged = true;
} }
void WarnEmptyDataset() { void WarnEmptyDataset() {
static thread_local bool logged{false}; static std::once_flag flag;
if (logged) { std::call_once(flag,
return; [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); });
} }
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
logged = true; void MismatchedDevices(Context const* booster, Context const* data) {
static std::once_flag flag;
std::call_once(flag, [&] {
LOG(WARNING)
<< "Falling back to prediction using DMatrix due to mismatched devices. This might "
"lead to higher memory usage and slower performance. XGBoost is running on: "
<< booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
)";
});
} }
} // namespace xgboost::error } // namespace xgboost::error
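
The consolidated warning now fires once per process via std::call_once rather than once per thread. From Python, a hedged illustration of the situation it describes: a CUDA booster receiving host data through inplace_predict (a CUDA device is assumed to be available):

import numpy as np
import xgboost as xgb

X = np.random.rand(128, 8)
y = np.random.rand(128)
booster = xgb.train({"device": "cuda"}, xgb.DMatrix(X, y), num_boost_round=5)

# Host (numpy) input with a CUDA booster triggers the one-time warning and the
# DMatrix-based fallback; a cupy array on the same device would avoid it.
preds = booster.inplace_predict(X)
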


@ -10,7 +10,8 @@
#include <limits> // for numeric_limits #include <limits> // for numeric_limits
#include <string> // for string #include <string> // for string
#include "xgboost/base.h" // for bst_feature_t #include "xgboost/base.h" // for bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "xgboost/string_view.h" // for StringView #include "xgboost/string_view.h" // for StringView
@ -94,5 +95,7 @@ constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU " return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
"available for using GPU."; "available for using GPU.";
} }
void MismatchedDevices(Context const* booster, Context const* data);
} // namespace xgboost::error } // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_ #endif // XGBOOST_COMMON_ERROR_MSG_H_


@ -384,7 +384,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
* @param length See the `length` parameter of `mmap` for details. * @param length See the `length` parameter of `mmap` for details.
*/ */
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {} : AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
new MmapResource{std::move(path), offset, length}}} {}
~PrivateMmapConstStream() noexcept(false) override; ~PrivateMmapConstStream() noexcept(false) override;
}; };


@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT [[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT [[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes(); return Span<const value_type>{data(), size()}.size_bytes();
} }
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT [[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT [[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT


@ -3,14 +3,23 @@
*/ */
#include "threading_utils.h" #include "threading_utils.h"
#include <fstream> #include <algorithm> // for max
#include <string> #include <exception> // for exception
#include <filesystem> // for path, exists
#include <fstream> // for ifstream
#include <string> // for string
#include "xgboost/logging.h" #include "common.h" // for DivRoundUp
namespace xgboost { namespace xgboost::common {
namespace common { /**
int32_t GetCfsCPUCount() noexcept { * Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
std::filesystem::path const& peroid_path) {
#if defined(__linux__) #if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115 // https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42 // http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
} }
}; };
// complete fair scheduler from Linux // complete fair scheduler from Linux
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")); auto const cfs_quota(read_int(quota_path.c_str()));
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us")); auto const cfs_period(read_int(peroid_path.c_str()));
if ((cfs_quota > 0) && (cfs_period > 0)) { if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1); return std::max(cfs_quota / cfs_period, 1);
} }
@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
return -1; return -1;
} }
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
std::int32_t cnt{-1};
#if defined(__linux__)
namespace fs = std::filesystem;
std::int32_t a{0}, b{0};
auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
try {
std::ifstream fin{bandwidth_path, std::ios::in};
fin >> a;
fin >> b;
} catch (std::exception const&) {
warn();
return cnt;
}
if (a > 0 && b > 0) {
cnt = std::max(common::DivRoundUp(a, b), 1);
}
#endif // defined(__linux__)
return cnt;
}
std::int32_t GetCfsCPUCount() noexcept {
namespace fs = std::filesystem;
fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
auto has_v2 = fs::exists(bandwidth_path);
if (has_v2) {
return GetCGroupV2Count(bandwidth_path);
}
fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
fs::path const peroid_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
auto has_v1 = fs::exists(quota_path) && fs::exists(peroid_path);
if (has_v1) {
return GetCGroupV1Count(quota_path, peroid_path);
}
return -1;
}
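
The v2 path reads the quota and period fields of /sys/fs/cgroup/cpu.max and returns DivRoundUp(quota, period) clamped to at least 1; a quota of "max" fails the integer parse above and leaves the count at -1. A small sketch of the same arithmetic (Python; file contents are illustrative):

import math

def cfs_cpu_count_v2(cpu_max: str) -> int:
    quota, period = cpu_max.split()
    if not quota.isdigit():       # "max" means no limit is configured
        return -1
    return max(math.ceil(int(quota) / int(period)), 1)

print(cfs_cpu_count_v2("200000 100000"))  # 2
print(cfs_cpu_count_v2("max 100000"))     # -1
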
std::int32_t OmpGetNumThreads(std::int32_t n_threads) { std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region. // Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) { if (omp_in_parallel()) {
@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1); n_threads = std::max(n_threads, 1);
return n_threads; return n_threads;
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost


@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS. * \brief Get thread limit from CFS.
* *
* This function has non-trivial overhead and should not be called repeatedly. * This function has non-trivial overhead and should not be called repeatedly.
*
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/ */
std::int32_t GetCfsCPUCount() noexcept; std::int32_t GetCfsCPUCount() noexcept;


@ -55,6 +55,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
} }
CHECK(p_fmat) << "Failed to fallback."; CHECK(p_fmat) << "Failed to fallback.";
p_fmat->Info() = proxy->Info().Copy();
return p_fmat; return p_fmat;
} }
} // namespace xgboost::data } // namespace xgboost::data


@ -1,5 +1,5 @@
/*! /**
* Copyright 2014-2022 by XGBoost Contributors * Copyright 2014-2023, XGBoost Contributors
* \file gblinear.cc * \file gblinear.cc
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is parallel coordinate descent (shotgun) * the update rule is parallel coordinate descent (shotgun)
@ -26,9 +26,9 @@
#include "../common/timer.h" #include "../common/timer.h"
#include "../common/common.h" #include "../common/common.h"
#include "../common/threading_utils.h" #include "../common/threading_utils.h"
#include "../common/error_msg.h"
namespace xgboost { namespace xgboost::gbm {
namespace gbm {
DMLC_REGISTRY_FILE_TAG(gblinear); DMLC_REGISTRY_FILE_TAG(gblinear);
@ -83,7 +83,16 @@ class GBLinear : public GradientBooster {
} }
param_.UpdateAllowUnknown(cfg); param_.UpdateAllowUnknown(cfg);
param_.CheckGPUSupport(); param_.CheckGPUSupport();
updater_.reset(LinearUpdater::Create(param_.updater, ctx_)); if (param_.updater == "gpu_coord_descent") {
LOG(WARNING) << error::DeprecatedFunc("gpu_coord_descent", "2.0.0",
R"(device="cuda", updater="coord_descent")");
}
if (param_.updater == "coord_descent" && ctx_->IsCUDA()) {
updater_.reset(LinearUpdater::Create("gpu_coord_descent", ctx_));
} else {
updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
}
updater_->Configure(cfg); updater_->Configure(cfg);
monitor_.Init("GBLinear"); monitor_.Init("GBLinear");
} }
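
After this change, the GPU linear updater is selected through the device parameter; the old gpu_coord_descent name still works but emits a deprecation warning. A hedged usage sketch (X and y are assumed to exist, and CUDA is assumed to be available):

import xgboost as xgb

params = {
    "booster": "gblinear",
    "updater": "coord_descent",  # dispatched to the GPU implementation when device is CUDA
    "device": "cuda",
}
booster = xgb.train(params, xgb.DMatrix(X, y), num_boost_round=10)  # X, y assumed defined
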
@ -354,5 +363,4 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
.set_body([](LearnerModelParam const* booster_config, Context const* ctx) { .set_body([](LearnerModelParam const* booster_config, Context const* ctx) {
return new GBLinear(booster_config, ctx); return new GBLinear(booster_config, ctx);
}); });
} // namespace gbm } // namespace xgboost::gbm
} // namespace xgboost


@ -85,25 +85,6 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
return name == up->Name(); return name == up->Name();
}); });
} }
void MismatchedDevices(Context const* booster, Context const* data) {
bool thread_local static logged{false};
if (logged) {
return;
}
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
"lead to higher memory usage and slower performance. XGBoost is running on: "
<< booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
<< ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once for each thread. Subsequent warnings made by the
current thread will be suppressed.
)";
logged = true;
}
} // namespace } // namespace
void GBTree::Configure(Args const& cfg) { void GBTree::Configure(Args const& cfg) {
@ -146,14 +127,6 @@ void GBTree::Configure(Args const& cfg) {
if (specified_updater_) { if (specified_updater_) {
error::WarnManualUpdater(); error::WarnManualUpdater();
} }
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method); LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
if (!specified_updater_) { if (!specified_updater_) {
@ -225,6 +198,13 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) { PredictionCacheEntry* predt, ObjFunction const* obj) {
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
}
TreesOneIter new_trees; TreesOneIter new_trees;
bst_target_t const n_groups = model_.learner_model_param->OutputLength(); bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees"); monitor_.Start("BoostNewTrees");
@ -555,7 +535,7 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (p_m->Ctx()->Device() != this->ctx_->Device()) { if (p_m->Ctx()->Device() != this->ctx_->Device()) {
MismatchedDevices(this->ctx_, p_m->Ctx()); error::MismatchedDevices(this->ctx_, p_m->Ctx());
CHECK_EQ(out_preds->version, 0); CHECK_EQ(out_preds->version, 0);
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m); auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
CHECK(proxy) << error::InplacePredictProxy(); CHECK(proxy) << error::InplacePredictProxy();
@ -808,7 +788,7 @@ class Dart : public GBTree {
auto n_groups = model_.learner_model_param->num_output_group; auto n_groups = model_.learner_model_param->num_output_group;
if (ctx_->Device() != p_fmat->Ctx()->Device()) { if (ctx_->Device() != p_fmat->Ctx()->Device()) {
MismatchedDevices(ctx_, p_fmat->Ctx()); error::MismatchedDevices(ctx_, p_fmat->Ctx());
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat); auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
CHECK(proxy) << error::InplacePredictProxy(); CHECK(proxy) << error::InplacePredictProxy();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);


@ -1317,7 +1317,9 @@ class LearnerImpl : public LearnerIO {
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
auto config = obj_->DefaultMetricConfig(); auto config = obj_->DefaultMetricConfig();
metrics_.back()->LoadConfig(config); if (!IsA<Null>(config)) {
metrics_.back()->LoadConfig(config);
}
metrics_.back()->Configure({cfg_.begin(), cfg_.end()}); metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
} }


@ -9,8 +9,7 @@
#include "coordinate_common.h" #include "coordinate_common.h"
#include "xgboost/json.h" #include "xgboost/json.h"
namespace xgboost { namespace xgboost::linear {
namespace linear {
DMLC_REGISTER_PARAMETER(CoordinateParam); DMLC_REGISTER_PARAMETER(CoordinateParam);
DMLC_REGISTRY_FILE_TAG(updater_coordinate); DMLC_REGISTRY_FILE_TAG(updater_coordinate);
@ -39,8 +38,9 @@ class CoordinateUpdater : public LinearUpdater {
FromJson(config.at("linear_train_param"), &tparam_); FromJson(config.at("linear_train_param"), &tparam_);
FromJson(config.at("coordinate_param"), &cparam_); FromJson(config.at("coordinate_param"), &cparam_);
} }
void SaveConfig(Json* p_out) const override { void SaveConfig(Json *p_out) const override {
auto& out = *p_out; LOG(DEBUG) << "Save config for CPU updater.";
auto &out = *p_out;
out["linear_train_param"] = ToJson(tparam_); out["linear_train_param"] = ToJson(tparam_);
out["coordinate_param"] = ToJson(cparam_); out["coordinate_param"] = ToJson(cparam_);
} }
@ -99,5 +99,4 @@ class CoordinateUpdater : public LinearUpdater {
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent") XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.") .describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); }); .set_body([]() { return new CoordinateUpdater(); });
} // namespace linear } // namespace xgboost::linear
} // namespace xgboost


@@ -15,8 +15,7 @@
 #include "../common/timer.h"
 #include "./param.h"
-namespace xgboost {
-namespace linear {
+namespace xgboost::linear {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
@@ -29,7 +28,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
 class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  public:
   // set training parameter
-  void Configure(Args const& args) override {
+  void Configure(Args const &args) override {
     tparam_.UpdateAllowUnknown(args);
     coord_param_.UpdateAllowUnknown(args);
     selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
@@ -41,8 +40,9 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     FromJson(config.at("linear_train_param"), &tparam_);
     FromJson(config.at("coordinate_param"), &coord_param_);
   }
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
+  void SaveConfig(Json *p_out) const override {
+    LOG(DEBUG) << "Save config for GPU updater.";
+    auto &out = *p_out;
     out["linear_train_param"] = ToJson(tparam_);
     out["coordinate_param"] = ToJson(coord_param_);
   }
@@ -101,10 +101,9 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     monitor_.Stop("LazyInitDevice");
     monitor_.Start("UpdateGpair");
-    auto &in_gpair_host = in_gpair->ConstHostVector();
     // Update gpair
     if (ctx_->gpu_id >= 0) {
-      this->UpdateGpair(in_gpair_host);
+      this->UpdateGpair(in_gpair->ConstHostVector());
     }
     monitor_.Stop("UpdateGpair");
@@ -249,5 +248,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
     "Update linear model according to coordinate descent algorithm. GPU "
     "accelerated.")
     .set_body([]() { return new GPUCoordinateUpdater(); });
-}  // namespace linear
-}  // namespace xgboost
+}  // namespace xgboost::linear

View File

@@ -268,6 +268,13 @@ class PseudoHuberRegression : public FitIntercept {
     }
     FromJson(in["pseudo_huber_param"], &param_);
   }
+  [[nodiscard]] Json DefaultMetricConfig() const override {
+    CHECK(param_.GetInitialised());
+    Json config{Object{}};
+    config["name"] = String{this->DefaultEvalMetric()};
+    config["pseudo_huber_param"] = ToJson(param_);
+    return config;
+  }
 };

 XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")

View File

@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
  */
 #include <algorithm>  // std::max
 #include <vector>
@@ -11,9 +11,7 @@
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 // With constraints
 XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
                                        const GradientPairInt64 &missing,
@@ -315,11 +313,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
                                     common::Span<common::CatBitField::value_type> out,
                                     DeviceSplitCandidate *p_out_split) {
   auto &out_split = *p_out_split;
-  out_split.split_cats = common::CatBitField{out};
+  auto out_cats = common::CatBitField{out};
   // Simple case for one hot split
   if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
-    out_split.split_cats.Set(common::AsCat(out_split.thresh));
+    out_cats.Set(common::AsCat(out_split.thresh));
     return;
   }
@@ -339,7 +337,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
     assert(partition > 0 && "Invalid partition.");
     thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
       auto cat = shared_inputs.feature_values[c - node_offset];
-      out_split.SetCat(cat);
+      out_cats.Set(common::AsCat(cat));
     });
   }
@@ -427,8 +425,7 @@ void GPUHistEvaluator::EvaluateSplits(
     if (split.is_cat) {
       SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
-                          device_cats_accessor.GetNodeCatStorage(input.nidx),
-                          &out_splits[i]);
+                          device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
     }
     float base_weight =
@@ -460,6 +457,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
                                 cudaMemcpyDeviceToHost));
   return root_entry;
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree

View File

@@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs {
   common::Span<const float> feature_values;
   common::Span<const float> min_fvalue;
   bool is_dense;
-  XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
-  __device__ auto FeatureBins(bst_feature_t fidx) const {
+  [[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
+  [[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
     return feature_segments[fidx + 1] - feature_segments[fidx];
   }
 };
@@ -102,7 +102,7 @@ class GPUHistEvaluator {
   }
   /**
-   * \brief Get device category storage of nidx for internal calculation.
+   * @brief Get device category storage of nidx for internal calculation.
    */
   auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
     if (!has_categoricals_) return CatAccessor{};
@@ -117,8 +117,8 @@ class GPUHistEvaluator {
   /**
    * \brief Get sorted index storage based on the left node of inputs.
    */
-  auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
-    if(!need_sort_histogram_) return common::Span<bst_feature_t>();
+  auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
+    if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
     cat_sorted_idx_.resize(num_nodes * total_bins);
     return dh::ToSpan(cat_sorted_idx_);
   }
@@ -142,12 +142,22 @@ class GPUHistEvaluator {
    * \brief Get host category storage for nidx. Different from the internal version, this
    * returns strictly 1 node.
    */
-  common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
+  [[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
     copy_stream_.View().Sync();
     auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
         nidx * node_categorical_storage_size_, node_categorical_storage_size_);
     return cats_out;
   }
+  [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
+    copy_stream_.View().Sync();
+    if (has_categoricals_) {
+      CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
+      return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
+    } else {
+      return common::KCatBitField{};
+    }
+  }
   /**
    * \brief Add a split to the internal tree evaluator.
    */

View File

@@ -64,7 +64,6 @@ struct DeviceSplitCandidate {
   // split.
   bst_cat_t thresh{-1};
-  common::CatBitField split_cats;
   bool is_cat { false };

   GradientPairInt64 left_sum;
@@ -72,12 +71,6 @@ struct DeviceSplitCandidate {
   XGBOOST_DEVICE DeviceSplitCandidate() {}  // NOLINT
-  template <typename T>
-  XGBOOST_DEVICE void SetCat(T c) {
-    this->split_cats.Set(common::AsCat(c));
-    fvalue = std::max(this->fvalue, static_cast<float>(c));
-  }
   XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in,
                              int findex_in, GradientPairInt64 left_sum_in,
                              GradientPairInt64 right_sum_in, bool cat,
@@ -100,22 +93,23 @@ struct DeviceSplitCandidate {
   XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in,
                                 bst_feature_t findex_in, GradientPairInt64 left_sum_in,
-                                GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) {
+                                GradientPairInt64 right_sum_in, GPUTrainingParam const& param,
+                                const GradientQuantiser& quantiser) {
     if (loss_chg_in > loss_chg &&
         quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
         quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
       loss_chg = loss_chg_in;
       dir = dir_in;
       fvalue = std::numeric_limits<float>::quiet_NaN();
       thresh = thresh_in;
       is_cat = true;
       left_sum = left_sum_in;
       right_sum = right_sum_in;
       findex = findex_in;
     }
   }
-  XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
+  [[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }

   friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
     os << "loss_chg:" << c.loss_chg << ", "
View File

@@ -7,9 +7,9 @@
 #include <algorithm>
 #include <cmath>
-#include <limits>
-#include <memory>
-#include <utility>
+#include <cstddef>  // for size_t
+#include <memory>   // for unique_ptr, make_unique
+#include <utility>  // for move
 #include <vector>

 #include "../collective/communicator-inl.cuh"
@@ -216,9 +216,9 @@ struct GPUHistMakerDevice {
   void InitFeatureGroupsOnce() {
     if (!feature_groups) {
       CHECK(page);
-      feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
-                                             dh::MaxSharedMemoryOptin(ctx_->gpu_id),
-                                             sizeof(GradientPairPrecise)));
+      feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
+                                                       dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                                       sizeof(GradientPairPrecise));
     }
   }
@@ -244,10 +244,10 @@ struct GPUHistMakerDevice {
     this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
-    quantiser.reset(new GradientQuantiser(this->gpair));
+    quantiser = std::make_unique<GradientQuantiser>(this->gpair);
     row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
     // Init histogram
     hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
@@ -294,7 +294,7 @@ struct GPUHistMakerDevice {
     dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
     // Store the feature set ptrs so they dont go out of scope before the kernel is called
     std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
-    for (size_t i = 0; i < candidates.size(); i++) {
+    for (std::size_t i = 0; i < candidates.size(); i++) {
       auto candidate = candidates.at(i);
       int left_nidx = tree[candidate.nid].LeftChild();
       int right_nidx = tree[candidate.nid].RightChild();
@@ -327,14 +327,13 @@ struct GPUHistMakerDevice {
         d_node_inputs.data().get(), h_node_inputs.data(),
         h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
-    this->evaluator_.EvaluateSplits(nidx, max_active_features,
-                                    dh::ToSpan(d_node_inputs), shared_inputs,
-                                    dh::ToSpan(entries));
+    this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
+                                    shared_inputs, dh::ToSpan(entries));
     dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                   entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
                                   cudaMemcpyDeviceToHost));
     dh::DefaultStream().Sync();
   }

   void BuildHist(int nidx) {
     auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -366,23 +365,29 @@ struct GPUHistMakerDevice {
   struct NodeSplitData {
     RegTree::Node split_node;
     FeatureType split_type;
-    common::CatBitField node_cats;
+    common::KCatBitField node_cats;
   };

-  void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
-    if (candidates.empty()) return;
-    std::vector<int> nidx(candidates.size());
-    std::vector<int> left_nidx(candidates.size());
-    std::vector<int> right_nidx(candidates.size());
+  void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
+    if (candidates.empty()) {
+      return;
+    }
+    std::vector<bst_node_t> nidx(candidates.size());
+    std::vector<bst_node_t> left_nidx(candidates.size());
+    std::vector<bst_node_t> right_nidx(candidates.size());
     std::vector<NodeSplitData> split_data(candidates.size());
     for (size_t i = 0; i < candidates.size(); i++) {
-      auto& e = candidates[i];
+      auto const& e = candidates[i];
       RegTree::Node split_node = (*p_tree)[e.nid];
       auto split_type = p_tree->NodeSplitType(e.nid);
       nidx.at(i) = e.nid;
       left_nidx.at(i) = split_node.LeftChild();
       right_nidx.at(i) = split_node.RightChild();
-      split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
+      split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};
+      CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
     }

     auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
@@ -390,7 +395,7 @@ struct GPUHistMakerDevice {
         nidx, left_nidx, right_nidx, split_data,
         [=] __device__(bst_uint ridx, const NodeSplitData& data) {
           // given a row index, returns the node id it belongs to
-          bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
+          float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
           // Missing value
           bool go_left = true;
           if (isnan(cut_value)) {
@@ -620,7 +625,6 @@ struct GPUHistMakerDevice {
     CHECK(common::CheckNAN(candidate.split.fvalue));
     std::vector<common::CatBitField::value_type> split_cats;
-    CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
     auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
     auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
     split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
View File

@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
 tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION}

+echo "--- Stash XGBoost4J JARs (Scala 2.12)"
+buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
+
 echo "--- Build XGBoost JVM packages scala 2.13"
 tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION} "" "" "true"

-echo "--- Stash XGBoost4J JARs"
+echo "--- Stash XGBoost4J JARs (Scala 2.13)"
 buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
 buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
 buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"

View File

@@ -0,0 +1,8 @@
steps:
- block: ":rocket: Run this test job"
if: build.pull_request.id != null || build.branch =~ /^dependabot\//
- label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11"
command: "tests/buildkite/test-macos-m1-clang11.sh"
key: mac-m1-appleclang11
agents:
queue: mac-mini-m1

View File

@@ -0,0 +1,50 @@
#!/bin/bash
set -euo pipefail
source tests/buildkite/conftest.sh
# Display system info
echo "--- Display system information"
set -x
system_profiler SPSoftwareDataType
sysctl -n machdep.cpu.brand_string
uname -m
set +x
# Build XGBoost4J binary
echo "--- Build libxgboost4j.dylib"
set -x
mkdir build
pushd build
export JAVA_HOME=$(/usr/libexec/java_home)
cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15
ninja -v
popd
rm -rf build
set +x
echo "--- Upload Python wheel"
set -x
pushd lib
mv -v libxgboost4j.dylib libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
buildkite-agent artifact upload libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib \
s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \
--acl public-read --no-progress
fi
popd
set +x
# Ensure that XGBoost can be built with Clang 11
echo "--- Build and Test XGBoost with MacOS M1, Clang 11"
set -x
LLVM11_PATH=$(brew --prefix llvm\@11)
mkdir build
pushd build
cmake .. -GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \
-DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \
-DUSE_DMLC_GTEST=ON
ninja -v

View File

@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG
 ARG RAPIDS_VERSION_ARG

View File

@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG

 # Install all basic requirements

View File

@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG

View File

@@ -27,6 +27,9 @@ fi
 mvn_profile_string=""
 if [ "x$use_scala213" != "x" ]; then
   export mvn_profile_string="-Pdefault,scala-2.13"
+  cd ..
+  python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+  cd jvm-packages
 fi

 mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options

View File

@@ -32,11 +32,10 @@ dependencies:
 - jsonschema
 - boto3
 - awscli
-- py-ubjson
 - cffi
 - pyarrow
 - pyspark>=3.4.0
 - cloudpickle
 - pip:
   - sphinx_rtd_theme
-  - datatable
+  - py-ubjson

View File

@@ -27,6 +27,9 @@ rm -rf ../build/
 # Deploy to S3 bucket xgboost-maven-repo
 mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests

 # Deploy scala 2.13 to S3 bucket xgboost-maven-repo
+cd ..
+python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+cd jvm-packages/
 mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests

View File

@@ -21,9 +21,18 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
 fi

 # including maven profiles for different scala versions: 2.12 is the default at the moment.
-for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
+for scala_binary_version in "2.12" "2.13"; do
+  cd ..
+  python dev/change_scala_version.py --scala-version ${scala_binary_version}
+  cd jvm-packages
   scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
-  scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
+  if [[ "$scala_binary_version" == "2.12" ]]; then
+    _maven_profile_string=""
+  elif [[ "$scala_binary_version" == "2.13" ]]; then
+    _maven_profile_string="-Pdefault,scala-2.13"
+  else
+    echo "Unexpected scala version: $scala_version ($scala_binary_version)."
+  fi

   # Install XGBoost4J JAR into local Maven repository
   mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar

View File

@@ -148,7 +148,8 @@ TEST(IO, Resource) {
   fout << 1.0 << std::endl;
   fout.close();

-  auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+  auto resource = std::shared_ptr<MmapResource>{
+      new MmapResource{path, 0, sizeof(double)}};
   ASSERT_EQ(resource->Size(), sizeof(double));
   ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
   ASSERT_EQ(resource->DataAs<double>()[0], val);

View File

@@ -0,0 +1,42 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/global_config.h> // for GlobalConfigThreadLocalStore
#include <xgboost/json.h> // for Json, Object
#include <xgboost/learner.h> // for Learner
#include <algorithm> // for transform
#include <string> // for string
#include <utility> // for swap
#include "../helpers.h" // for RandomDataGenerator
namespace xgboost {
TEST(GBlinear, DispatchUpdater) {
auto verbosity = 3;
std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity);
auto test = [](std::string device) {
auto p_fmat = RandomDataGenerator{10, 10, 0.0f}.GenerateDMatrix(true);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParams(
Args{{"booster", "gblinear"}, {"updater", "coord_descent"}, {"device", device}});
learner->Configure();
for (std::int32_t iter = 0; iter < 3; ++iter) {
learner->UpdateOneIter(iter, p_fmat);
}
Json config{Object{}};
::testing::internal::CaptureStderr();
learner->SaveConfig(&config);
auto str = ::testing::internal::GetCapturedStderr();
std::transform(device.cbegin(), device.cend(), device.begin(),
[](char c) { return std::toupper(c); });
ASSERT_NE(str.find(device), std::string::npos);
};
test("cpu");
test("gpu");
std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity);
}
} // namespace xgboost

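For reference, the parameters the dispatch test exercises correspond to the public Python API; a minimal sketch under that assumption (synthetic data, values are illustrative only):

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(32, 8)
    y = np.random.randn(32)
    # gblinear with coordinate descent; in this branch the updater is dispatched
    # to the CPU or GPU implementation based on the "device" parameter.
    booster = xgb.train(
        {"booster": "gblinear", "updater": "coord_descent", "device": "cpu"},
        xgb.DMatrix(X, label=y),
        num_boost_round=3,
    )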
View File

@@ -58,21 +58,6 @@ void TestInplaceFallback(Context const* ctx) {
   HostDeviceVector<float>* out_predt{nullptr};
   ConsoleLogger::Configure(Args{{"verbosity", "1"}});
   std::string output;
-  // test whether the warning is raised
-#if !defined(_WIN32)
-  // Windows has issue with CUDA and thread local storage. For some reason, on Windows a
-  // cudaInitializationError is raised during destruction of `HostDeviceVector`. This
-  // might be related to https://github.com/dmlc/xgboost/issues/5793
-  ::testing::internal::CaptureStderr();
-  std::thread{[&] {
-    // Launch a new thread to ensure a warning is raised as we prevent over-verbose
-    // warning by using thread-local flags.
-    learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
-                            &out_predt, 0, 0);
-  }}.join();
-  output = testing::internal::GetCapturedStderr();
-  ASSERT_NE(output.find("Falling back"), std::string::npos);
-#endif
   learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                           &out_predt, 0, 0);

View File

@@ -6,6 +6,7 @@
 #include <xgboost/objective.h>

 #include "../helpers.h"
+#include "../objective_helpers.h"

 TEST(Objective, UnknownFunction) {
   xgboost::ObjFunction* obj = nullptr;
@@ -43,4 +44,61 @@ TEST(Objective, PredTransform) {
     ASSERT_TRUE(predts.HostCanWrite());
   }
 }
+
+class TestDefaultObjConfig : public ::testing::TestWithParam<std::string> {
+  Context ctx_;
+
+ public:
+  void Run(std::string objective) {
+    auto Xy = MakeFmatForObjTest(objective);
+    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+    std::unique_ptr<ObjFunction> objfn{ObjFunction::Create(objective, &ctx_)};
+
+    learner->SetParam("objective", objective);
+    if (objective.find("multi") != std::string::npos) {
+      learner->SetParam("num_class", "3");
+      objfn->Configure(Args{{"num_class", "3"}});
+    } else if (objective.find("quantile") != std::string::npos) {
+      learner->SetParam("quantile_alpha", "0.5");
+      objfn->Configure(Args{{"quantile_alpha", "0.5"}});
+    } else {
+      objfn->Configure(Args{});
+    }
+    learner->Configure();
+    learner->UpdateOneIter(0, Xy);
+    learner->EvalOneIter(0, {Xy}, {"train"});
+    Json config{Object{}};
+    learner->SaveConfig(&config);
+
+    auto jobj = get<Object const>(config["learner"]["objective"]);
+    ASSERT_TRUE(jobj.find("name") != jobj.cend());
+    // FIXME(jiamingy): We should have the following check, but some legacy parameter like
+    // "pos_weight", "delta_step" in objectives are not in metrics.
+    // if (jobj.size() > 1) {
+    //   ASSERT_FALSE(IsA<Null>(objfn->DefaultMetricConfig()));
+    // }
+    auto mconfig = objfn->DefaultMetricConfig();
+    if (!IsA<Null>(mconfig)) {
+      // make sure metric can handle it
+      std::unique_ptr<Metric> metricfn{Metric::Create(get<String const>(mconfig["name"]), &ctx_)};
+      metricfn->LoadConfig(mconfig);
+      Json loaded(Object{});
+      metricfn->SaveConfig(&loaded);
+      metricfn->Configure(Args{});
+      ASSERT_EQ(mconfig, loaded);
+    }
+  }
+};
+
+TEST_P(TestDefaultObjConfig, Objective) {
+  std::string objective = GetParam();
+  this->Run(objective);
+}
+
+INSTANTIATE_TEST_SUITE_P(Objective, TestDefaultObjConfig,
+                         ::testing::ValuesIn(MakeObjNamesForTest()),
+                         [](const ::testing::TestParamInfo<TestDefaultObjConfig::ParamType>& info) {
+                           return ObjTestNameGenerator(info);
+                         });
 }  // namespace xgboost

View File

@@ -0,0 +1,31 @@
/**
* Copyright (c) 2023, XGBoost contributors
*/
#include "objective_helpers.h"
#include "../../src/common/linalg_op.h" // for begin, end
#include "helpers.h" // for RandomDataGenerator
namespace xgboost {
std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj) {
auto constexpr kRows = 10, kCols = 10;
auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (obj.find("rank:") != std::string::npos) {
auto h_label = p_fmat->Info().labels.HostView();
std::size_t k = 0;
for (auto& v : h_label) {
v = k % 2 == 0;
++k;
}
}
return p_fmat;
};
} // namespace xgboost

View File

@@ -1,6 +1,8 @@
 /**
  * Copyright (c) 2023, XGBoost contributors
  */
+#pragma once
+
 #include <dmlc/registry.h>  // for Registry
 #include <gtest/gtest.h>
 #include <xgboost/objective.h>  // for ObjFunctionReg
@@ -29,4 +31,6 @@ inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo<ParamType
   }
   return name;
 };
+
+std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj);
 }  // namespace xgboost

View File

@@ -655,33 +655,11 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
 TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }

 class TestColumnSplit : public ::testing::TestWithParam<std::string> {
-  static auto MakeFmat(std::string const& obj) {
-    auto constexpr kRows = 10, kCols = 10;
-    auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-    auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
-    auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
-    h_lower.resize(kRows);
-    h_upper.resize(kRows);
-    for (size_t i = 0; i < kRows; ++i) {
-      h_lower[i] = 1;
-      h_upper[i] = 10;
-    }
-    if (obj.find("rank:") != std::string::npos) {
-      auto h_label = p_fmat->Info().labels.HostView();
-      std::size_t k = 0;
-      for (auto& v : h_label) {
-        v = k % 2 == 0;
-        ++k;
-      }
-    }
-    return p_fmat;
-  };
-
   void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
     auto const world_size = collective::GetWorldSize();
     auto const rank = collective::GetRank();
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
     std::unique_ptr<Learner> learner{Learner::Create({sliced})};
     learner->SetParam("tree_method", "approx");
@@ -705,7 +683,7 @@ class TestColumnSplit : public ::testing::TestWithParam<std::string> {
  public:
   void Run(std::string objective) {
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
     learner->SetParam("tree_method", "approx");
     learner->SetParam("objective", objective);

View File

@@ -191,14 +191,32 @@ class TestGPUPredict:
         np.testing.assert_allclose(predt_0, predt_3)
         np.testing.assert_allclose(predt_0, predt_4)

-    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
+    def run_inplace_base_margin(
+        self, device: int, booster: xgb.Booster, dtrain: xgb.DMatrix, X, base_margin
+    ) -> None:
         import cupy as cp

+        booster.set_param({"device": f"cuda:{device}"})
         dtrain.set_info(base_margin=base_margin)
         from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
         from_dmatrix = booster.predict(dtrain)
         cp.testing.assert_allclose(from_inplace, from_dmatrix)

+        booster = booster.copy()  # clear prediction cache.
+        booster.set_param({"device": "cpu"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix)
+
+        booster = booster.copy()  # clear prediction cache.
+        base_margin = cp.asnumpy(base_margin)
+        if hasattr(X, "values"):
+            X = cp.asnumpy(X.values)
+        booster.set_param({"device": f"cuda:{device}"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6)
+
     def run_inplace_predict_cupy(self, device: int) -> None:
         import cupy as cp
@@ -244,7 +262,7 @@ class TestGPUPredict:
         run_threaded_predict(X, rows, predict_dense)

         base_margin = cp_rng.randn(rows)
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(device, booster, dtrain, X, base_margin)

         # Create a wide dataset
         X = cp_rng.randn(100, 10000)
@@ -318,7 +336,7 @@ class TestGPUPredict:
         run_threaded_predict(X, rows, predict_df)

         base_margin = cudf.Series(rng.randn(rows))
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(0, booster, dtrain, X, base_margin)

     @given(
         strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy

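The test above checks that inplace prediction and DMatrix-based prediction agree when both receive the same base margin. A minimal CPU-only sketch of that invariant with the public API (synthetic data; the tolerance is illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(64, 4)
    y = np.random.randn(64)
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y), num_boost_round=4)

    # base_margin shifts the raw prediction; both prediction paths should agree.
    margin = np.full(X.shape[0], 0.5, dtype=np.float32)
    from_inplace = booster.inplace_predict(X, base_margin=margin)
    from_dmatrix = booster.predict(xgb.DMatrix(X, base_margin=margin))
    np.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6)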
View File

@@ -9,7 +9,7 @@ import pytest
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df

 sys.path.append("tests/python")
 import test_with_sklearn as twskl  # noqa
@@ -165,6 +165,11 @@ def test_ranking_qid_df():
     run_ranking_qid_df(cudf, "gpu_hist")


+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_categorical() -> None:
+    run_ranking_categorical(device="cuda")
+
+
 @pytest.mark.skipif(**tm.no_cupy())
 @pytest.mark.mgpu
 def test_device_ordinal() -> None:

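run_ranking_categorical is a shared testing helper; the public-API pattern it presumably exercises (the ranker fix backported in this branch) looks roughly like the sketch below. All names, group sizes, and data are made up for illustration:

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "f0": rng.normal(size=16),
        "f1": pd.Categorical(rng.choice(["a", "b", "c"], size=16)),
    })
    y = rng.integers(0, 2, size=16)      # binary relevance labels
    qid = np.repeat([0, 1], 8)           # two query groups, already sorted

    ranker = xgb.XGBRanker(tree_method="hist", enable_categorical=True, n_estimators=4)
    ranker.fit(X, y, qid=qid)
    scores = ranker.predict(X)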
View File

@@ -211,7 +211,7 @@ class TestPandas:
         y = np.random.randn(kRows)
         w = np.random.uniform(size=kRows).astype(np.float32)
         w_pd = pd.DataFrame(w)
-        data = xgb.DMatrix(X, y, w_pd)
+        data = xgb.DMatrix(X, y, weight=w_pd)

         assert data.num_row() == kRows
         assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:
     @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
     def test_nullable_type(self, DMatrixT) -> None:
-        from pandas.api.types import is_categorical_dtype
+        from xgboost.data import is_pd_cat_dtype

         for orig, df in pd_dtypes():
             if hasattr(df.dtypes, "__iter__"):
-                enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
+                enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
             else:
                 # series
-                enable_categorical = is_categorical_dtype(df.dtype)
+                enable_categorical = is_pd_cat_dtype(df.dtype)

             f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
             f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df

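For context, the usual way to hand pandas categorical columns to XGBoost, which the nullable-type test builds on, is sketched below (synthetic data; column names are illustrative):

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    # Toy frame with one categorical column; enable_categorical lets DMatrix
    # keep the category codes instead of rejecting the dtype.
    df = pd.DataFrame({
        "f0": np.random.randn(8),
        "f1": pd.Categorical(["a", "b", "a", "c", "b", "a", "c", "b"]),
    })
    y = np.random.randn(8)
    dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
    booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=2)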
View File

@@ -12,7 +12,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
 from xgboost.testing.shared import get_feature_weights, validate_data_initialization
 from xgboost.testing.updater import get_basescore
@@ -173,6 +173,11 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)


+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_categorical() -> None:
+    run_ranking_categorical(device="cpu")
+
+
 def test_ranking_metric() -> None:
     from sklearn.metrics import roc_auc_score
@@ -935,6 +940,7 @@ def save_load_model(model_path):
     predt_0 = clf.predict(X)
     clf.save_model(model_path)
     clf.load_model(model_path)
+    assert clf.booster == "gblinear"
     predt_1 = clf.predict(X)
     np.testing.assert_allclose(predt_0, predt_1)
     assert clf.best_iteration == best_iteration
@@ -950,25 +956,26 @@ def save_load_model(model_path):
 def test_save_load_model():
     with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model')
+        model_path = os.path.join(tempdir, "digits.model")
         save_load_model(model_path)

     with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model.json')
+        model_path = os.path.join(tempdir, "digits.model.json")
         save_load_model(model_path)

     from sklearn.datasets import load_digits
     from sklearn.model_selection import train_test_split

     with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model.ubj')
+        model_path = os.path.join(tempdir, "digits.model.ubj")
         digits = load_digits(n_class=2)
-        y = digits['target']
-        X = digits['data']
-        booster = xgb.train({'tree_method': 'hist',
-                             'objective': 'binary:logistic'},
-                            dtrain=xgb.DMatrix(X, y),
-                            num_boost_round=4)
+        y = digits["target"]
+        X = digits["data"]
+        booster = xgb.train(
+            {"tree_method": "hist", "objective": "binary:logistic"},
+            dtrain=xgb.DMatrix(X, y),
+            num_boost_round=4,
+        )
         predt_0 = booster.predict(xgb.DMatrix(X))
         booster.save_model(model_path)
         cls = xgb.XGBClassifier()
@@ -1002,6 +1009,8 @@ def test_save_load_model():
         clf = xgb.XGBClassifier()
         clf.load_model(model_path)
         assert clf.classes_.size == 10
+        assert clf.objective == "multi:softprob"
+
         np.testing.assert_equal(clf.classes_, np.arange(10))
         assert clf.n_classes_ == 10

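A minimal sketch of the save/load round trip the test exercises, assuming the attribute-loading fix backported above so that attributes such as the objective survive reloading (paths and sizes are illustrative):

    import os
    import tempfile

    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import load_digits

    X, y = load_digits(n_class=2, return_X_y=True)
    clf = xgb.XGBClassifier(n_estimators=4, tree_method="hist").fit(X, y)

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "clf.json")
        clf.save_model(path)

        loaded = xgb.XGBClassifier()
        loaded.load_model(path)
        # Estimator attributes are restored from the saved file.
        assert loaded.objective == "binary:logistic"
        np.testing.assert_allclose(clf.predict_proba(X), loaded.predict_proba(X))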
View File

@@ -2,6 +2,7 @@ import json
 import logging
 import subprocess

+import numpy as np
 import pytest
 import sklearn
@@ -13,7 +14,7 @@ from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import SparkSession

-from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
+from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor, SparkXGBRegressorModel

 gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"
@@ -242,3 +243,33 @@ def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature
     evaluator = RegressionEvaluator(metricName="rmse")
     rmse = evaluator.evaluate(pred_result_df)
     assert rmse <= 65.0
+
+
+def test_gpu_transform(spark_diabetes_dataset) -> None:
+    regressor = SparkXGBRegressor(device="cuda", num_workers=num_workers)
+    train_df, test_df = spark_diabetes_dataset
+    model: SparkXGBRegressorModel = regressor.fit(train_df)
+
+    # The model trained with GPUs, and transform with GPU configurations.
+    assert model._gpu_transform()
+
+    model.set_device("cpu")
+    assert not model._gpu_transform()
+    # without error
+    cpu_rows = model.transform(test_df).select("prediction").collect()
+
+    regressor = SparkXGBRegressor(device="cpu", num_workers=num_workers)
+    model = regressor.fit(train_df)
+    # The model trained with CPUs. Even with GPU configurations,
+    # still prefer transforming with CPUs
+    assert not model._gpu_transform()
+
+    # Set gpu transform explicitly.
+    model.set_device("cuda")
+    assert model._gpu_transform()
+    # without error
+    gpu_rows = model.transform(test_df).select("prediction").collect()
+
+    for cpu, gpu in zip(cpu_rows, gpu_rows):
+        np.testing.assert_allclose(cpu.prediction, gpu.prediction, atol=1e-3)

View File

@@ -1932,6 +1932,7 @@ class TestWithDask:
         cls.client = client
         cls.fit(X, y)
         predt_0 = cls.predict(X)
+        proba_0 = cls.predict_proba(X)

         with tempfile.TemporaryDirectory() as tmpdir:
             path = os.path.join(tmpdir, "model.pkl")
@@ -1941,7 +1942,9 @@ class TestWithDask:
             with open(path, "rb") as fd:
                 cls = pickle.load(fd)
             predt_1 = cls.predict(X)
+            proba_1 = cls.predict_proba(X)
             np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
+            np.testing.assert_allclose(proba_0.compute(), proba_1.compute())

             path = os.path.join(tmpdir, "cls.json")
             cls.save_model(path)
@@ -1950,16 +1953,20 @@ class TestWithDask:
             cls.load_model(path)
             assert cls.n_classes_ == 10
             predt_2 = cls.predict(X)
+            proba_2 = cls.predict_proba(X)
             np.testing.assert_allclose(predt_0.compute(), predt_2.compute())
+            np.testing.assert_allclose(proba_0.compute(), proba_2.compute())

             # Use single node to load
             cls = xgb.XGBClassifier()
             cls.load_model(path)
             assert cls.n_classes_ == 10
             predt_3 = cls.predict(X_)
+            proba_3 = cls.predict_proba(X_)
             np.testing.assert_allclose(predt_0.compute(), predt_3)
+            np.testing.assert_allclose(proba_0.compute(), proba_3)


 def test_dask_unsupported_features(client: "Client") -> None:
View File

@@ -888,6 +888,34 @@ class TestPySparkLocal:
         clf = SparkXGBClassifier(device="cuda")
         clf._validate_params()

+    def test_gpu_transform(self, clf_data: ClfData) -> None:
+        """local mode"""
+        classifier = SparkXGBClassifier(device="cpu")
+        model: SparkXGBClassifierModel = classifier.fit(clf_data.cls_df_train)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = "file:" + tmpdir
+            model.write().overwrite().save(path)
+
+            # The model trained with CPU, transform defaults to cpu
+            assert not model._gpu_transform()
+            # without error
+            model.transform(clf_data.cls_df_test).collect()
+
+            model.set_device("cuda")
+            assert model._gpu_transform()
+
+            model_loaded = SparkXGBClassifierModel.load(path)
+            # The model trained with CPU, transform defaults to cpu
+            assert not model_loaded._gpu_transform()
+            # without error
+            model_loaded.transform(clf_data.cls_df_test).collect()
+
+            model_loaded.set_device("cuda")
+            assert model_loaded._gpu_transform()
+

 class XgboostLocalTest(SparkTestCase):
     def setUp(self):