Fix typo. (#8192)

Fix release script. (#8187)
[backport] Fix loading DMatrix binary in distributed env. (#8149) (#8185)
2022-08-22 16:36:38 +08:00 · 2022-08-22 01:07:54 +08:00 · 2022-08-19 04:11:12 +08:00 · 2022-08-17 22:06:07 -07:00 · 2022-08-18 04:13:10 +08:00 · 2022-08-16 14:38:15 +08:00
58 changed files with 541 additions and 323 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 1.6.1)
+project(xgboost LANGUAGES CXX C VERSION 1.6.2)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
@@ -200,6 +200,10 @@ endif (JVM_BINDINGS)
 # Plugin
 add_subdirectory(${xgboost_SOURCE_DIR}/plugin)

+if (PLUGIN_RMM)
+  find_package(rmm REQUIRED)
+endif (PLUGIN_RMM)
+
 #-- library
 if (BUILD_STATIC_LIB)
  add_library(xgboost STATIC)
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -7,7 +7,7 @@
 dockerRun = 'tests/ci_build/ci_build.sh'

 // Which CUDA version to use when building reference distribution wheel
-ref_cuda_ver = '11.0'
+ref_cuda_ver = '11.0.3'

 import groovy.transform.Field

@@ -60,9 +60,9 @@ pipeline {
            'build-cpu-rabit-mock': { BuildCPUMock() },
            // Build reference, distribution-ready Python wheel with CUDA 11.0
            // using CentOS 7 image
-            'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0', build_rmm: true) },
-            'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0') },
-            'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0') },
+            'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0.3', build_rmm: true) },
+            'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0.3') },
+            'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0.3') },
            'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.1') },
            'build-jvm-doc': { BuildJVMDoc() }
          ])
@@ -77,9 +77,9 @@ pipeline {
            'test-python-cpu': { TestPythonCPU() },
            'test-python-cpu-arm64': { TestPythonCPUARM64() },
            // artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env
-            'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) },
-            'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', multi_gpu: true, test_rmm: true) },
-            'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) },
+            'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) },
+            'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', multi_gpu: true, test_rmm: true) },
+            'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) },
            'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') }
          ])
        }
@@ -123,7 +123,7 @@ def ClangTidy() {
    echo "Running clang-tidy job..."
    def container_type = "clang_tidy"
    def docker_binary = "docker"
-    def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0"
+    def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0.3"
    sh """
    ${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py --cuda-archs 75
    """
@@ -397,7 +397,7 @@ def TestCppGPU(args) {
  node(nodeReq) {
    unstash name: "xgboost_cpp_tests_cuda${artifact_cuda_version}"
    unstash name: 'srcs'
-    echo "Test C++, CUDA ${args.host_cuda_version}"
+    echo "Test C++, CUDA ${args.host_cuda_version}, rmm: ${args.test_rmm}"
    def container_type = "gpu"
    def docker_binary = "nvidia-docker"
    def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
@@ -410,7 +410,7 @@ def TestCppGPU(args) {
      docker_binary = "nvidia-docker"
      docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
      sh """
-      ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool --gtest_filter=-*DeathTest.*"
+      ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool"
      """
    }
    deleteDir()
@@ -445,7 +445,7 @@ def DeployJVMPackages(args) {
    if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
      echo 'Deploying to xgboost-maven-repo S3 repo...'
      sh """
-      ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
+      ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0.3 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
      """
    }
    deleteDir()
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.6.0.1
+Version: 1.6.2.1
 Date: 2022-03-29
 Authors@R: c(
  person("Tianqi", "Chen", role = c("aut"),
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -169,10 +169,17 @@ function(xgboost_set_cuda_flags target)
      $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
  endif (MSVC)

-  set_target_properties(${target} PROPERTIES
-    CUDA_STANDARD 14
-    CUDA_STANDARD_REQUIRED ON
-    CUDA_SEPARABLE_COMPILATION OFF)
+  if (PLUGIN_RMM)
+    set_target_properties(${target} PROPERTIES
+      CUDA_STANDARD 17
+      CUDA_STANDARD_REQUIRED ON
+      CUDA_SEPARABLE_COMPILATION OFF)
+  else ()
+    set_target_properties(${target} PROPERTIES
+      CUDA_STANDARD 14
+      CUDA_STANDARD_REQUIRED ON
+      CUDA_SEPARABLE_COMPILATION OFF)
+  endif (PLUGIN_RMM)
 endfunction(xgboost_set_cuda_flags)

 macro(xgboost_link_nccl target)
@@ -189,10 +196,18 @@ endmacro(xgboost_link_nccl)

 # compile options
 macro(xgboost_target_properties target)
-  set_target_properties(${target} PROPERTIES
-    CXX_STANDARD 14
-    CXX_STANDARD_REQUIRED ON
-    POSITION_INDEPENDENT_CODE ON)
+  if (PLUGIN_RMM)
+    set_target_properties(${target} PROPERTIES
+      CXX_STANDARD 17
+      CXX_STANDARD_REQUIRED ON
+      POSITION_INDEPENDENT_CODE ON)
+  else ()
+    set_target_properties(${target} PROPERTIES
+      CXX_STANDARD 14
+      CXX_STANDARD_REQUIRED ON
+      POSITION_INDEPENDENT_CODE ON)
+  endif (PLUGIN_RMM)
+
  if (HIDE_CXX_SYMBOLS)
    #-- Hide all C++ symbols
    set_target_properties(${target} PROPERTIES
@@ -247,6 +262,10 @@ macro(xgboost_target_defs target)
      PRIVATE
      -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
  endif (XGBOOST_BUILTIN_PREFETCH_PRESENT)
+
+  if (PLUGIN_RMM)
+    target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
+  endif (PLUGIN_RMM)
 endmacro(xgboost_target_defs)

 # handles dependencies
@@ -269,6 +288,10 @@ macro(xgboost_target_link_libraries target)
    xgboost_set_cuda_flags(${target})
  endif (USE_CUDA)

+  if (PLUGIN_RMM)
+    target_link_libraries(${target} PRIVATE rmm::rmm)
+  endif (PLUGIN_RMM)
+
  if (USE_NCCL)
    xgboost_link_nccl(${target})
  endif (USE_NCCL)
--- a/dev/release-py-r.py
+++ b/dev/release-py-r.py
@@ -75,7 +75,7 @@ def download_wheels(
    return filenames


-def download_py_packages(major: int, minor: int, commit_hash: str):
+def download_py_packages(branch: str, major: int, minor: int, commit_hash: str) -> None:
    platforms = [
        "win_amd64",
        "manylinux2014_x86_64",
@@ -84,7 +84,8 @@ def download_py_packages(major: int, minor: int, commit_hash: str):
        "macosx_12_0_arm64"
    ]

-    dir_URL = PREFIX + str(major) + "." + str(minor) + ".0" + "/"
+    branch = branch.split("_")[1]  # release_x.y.z
+    dir_URL = PREFIX + branch + "/"
    src_filename_prefix = "xgboost-" + args.release + "%2B" + commit_hash + "-py3-none-"
    target_filename_prefix = "xgboost-" + args.release + "-py3-none-"

@@ -105,16 +106,17 @@ Following steps should be done manually:
    )


-def download_r_packages(release: str, rc: str, commit: str) -> None:
+def download_r_packages(release: str, branch: str, rc: str, commit: str) -> None:
    platforms = ["win64", "linux"]
    dirname = "./r-packages"
    if not os.path.exists(dirname):
        os.mkdir(dirname)

    filenames = []
+    branch = branch.split("_")[1]  # release_x.y.z

    for plat in platforms:
-        url = f"{PREFIX}{release}/xgboost_r_gpu_{plat}_{commit}.tar.gz"
+        url = f"{PREFIX}{branch}/xgboost_r_gpu_{plat}_{commit}.tar.gz"

        if not rc:
            filename = f"xgboost_r_gpu_{plat}_{release}.tar.gz"
@@ -152,7 +154,11 @@ def main(args: argparse.Namespace) -> None:
        assert rc == "rc"

    release = str(major) + "." + str(minor) + "." + str(patch)
-    branch = "release_" + release
+    if args.branch is not None:
+        branch = args.branch
+    else:
+        branch = "release_" + str(major) + "." + str(minor) + ".0"
+
    git.clean("-xdf")
    git.checkout(branch)
    git.pull("origin", branch)
@@ -160,10 +166,10 @@ def main(args: argparse.Namespace) -> None:
    commit_hash = latest_hash()

    download_r_packages(
-        release, "" if rc is None else rc + str(rc_ver), commit_hash
+        release, branch, "" if rc is None else rc + str(rc_ver), commit_hash
    )

-    download_py_packages(major, minor, commit_hash)
+    download_py_packages(branch, major, minor, commit_hash)


 if __name__ == "__main__":
@@ -174,5 +180,14 @@ if __name__ == "__main__":
        required=True,
        help="Version tag, e.g. '1.3.2', or '1.5.0rc1'"
    )
+    parser.add_argument(
+        "--branch",
+        type=str,
+        default=None,
+        help=(
+            "Optional branch. Usually patch releases reuse the same branch of the"
+            " major release, but there can be exception."
+        )
+    )
    args = parser.parse_args()
    main(args)
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -6,6 +6,6 @@

 #define XGBOOST_VER_MAJOR 1
 #define XGBOOST_VER_MINOR 6
-#define XGBOOST_VER_PATCH 1
+#define XGBOOST_VER_PATCH 2

 #endif  // XGBOOST_VERSION_CONFIG_H_
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@

    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.6.1</version>
+    <version>1.6.2</version>
    <packaging>pom</packaging>
    <name>XGBoost JVM Package</name>
    <description>JVM Package for XGBoost</description>
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.6.1</version>
+        <version>1.6.2</version>
    </parent>
    <artifactId>xgboost4j-example_2.12</artifactId>
-    <version>1.6.1</version>
+    <version>1.6.2</version>
    <packaging>jar</packaging>
    <build>
        <plugins>
@@ -26,7 +26,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-            <version>1.6.1</version>
+            <version>1.6.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-            <version>1.6.1</version>
+            <version>1.6.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.6.1</version>
+        <version>1.6.2</version>
    </parent>
    <artifactId>xgboost4j-flink_2.12</artifactId>
-    <version>1.6.1</version>
+    <version>1.6.2</version>
    <build>
        <plugins>
            <plugin>
@@ -26,7 +26,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.6.1</version>
+            <version>1.6.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.6.1</version>
+        <version>1.6.2</version>
    </parent>
    <artifactId>xgboost4j-gpu_2.12</artifactId>
-    <version>1.6.1</version>
+    <version>1.6.2</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -6,7 +6,7 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.6.1</version>
+        <version>1.6.2</version>
    </parent>
    <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
    <build>
@@ -24,7 +24,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
-            <version>1.6.1</version>
+            <version>1.6.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.6.1</version>
+        <version>1.6.2</version>
    </parent>
    <artifactId>xgboost4j-spark_2.12</artifactId>
    <build>
@@ -24,7 +24,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.6.1</version>
+            <version>1.6.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.6.1</version>
+        <version>1.6.2</version>
    </parent>
    <artifactId>xgboost4j_2.12</artifactId>
-    <version>1.6.1</version>
+    <version>1.6.2</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -2,19 +2,6 @@ if (PLUGIN_DENSE_PARSER)
  target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc)
 endif (PLUGIN_DENSE_PARSER)

-if (PLUGIN_RMM)
-  find_path(RMM_INCLUDE "rmm" HINTS "$ENV{RMM_ROOT}/include")
-  if (NOT RMM_INCLUDE)
-    message(FATAL_ERROR "Could not locate RMM library")
-  endif ()
-
-  message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}")
-  message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}")
-
-  target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE})
-  target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
-endif (PLUGIN_RMM)
-
 if (PLUGIN_UPDATER_ONEAPI)
  add_library(oneapi_plugin OBJECT
    ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc
--- a/python-package/MANIFEST.in
+++ b/python-package/MANIFEST.in
@@ -18,6 +18,8 @@ recursive-include xgboost/include *
 recursive-include xgboost/plugin *
 recursive-include xgboost/src *

+recursive-include xgboost/gputreeshap/GPUTreeShap *
+
 include xgboost/rabit/CMakeLists.txt
 recursive-include xgboost/rabit/include *
 recursive-include xgboost/rabit/src *
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -61,6 +61,7 @@ def copy_tree(src_dir: str, target_dir: str) -> None:
    src = os.path.join(src_dir, 'src')
    inc = os.path.join(src_dir, 'include')
    dmlc_core = os.path.join(src_dir, 'dmlc-core')
+    gputreeshap = os.path.join(src_dir, "gputreeshap")
    rabit = os.path.join(src_dir, 'rabit')
    cmake = os.path.join(src_dir, 'cmake')
    plugin = os.path.join(src_dir, 'plugin')
@@ -68,6 +69,7 @@ def copy_tree(src_dir: str, target_dir: str) -> None:
    clean_copy_tree(src, os.path.join(target_dir, 'src'))
    clean_copy_tree(inc, os.path.join(target_dir, 'include'))
    clean_copy_tree(dmlc_core, os.path.join(target_dir, 'dmlc-core'))
+    clean_copy_tree(gputreeshap, os.path.join(target_dir, "gputreeshap"))
    clean_copy_tree(rabit, os.path.join(target_dir, 'rabit'))
    clean_copy_tree(cmake, os.path.join(target_dir, 'cmake'))
    clean_copy_tree(plugin, os.path.join(target_dir, 'plugin'))
@@ -97,7 +99,7 @@ class BuildExt(build_ext.build_ext):  # pylint: disable=too-many-ancestors

    logger = logging.getLogger('XGBoost build_ext')

-    # pylint: disable=too-many-arguments,no-self-use
+    # pylint: disable=too-many-arguments
    def build(
        self,
        src_dir: str,
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-1.6.1
+1.6.2
--- a/python-package/xgboost/__init__.py
+++ b/python-package/xgboost/__init__.py
@@ -1,12 +1,16 @@
-# coding: utf-8
 """XGBoost: eXtreme Gradient Boosting library.

 Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
 """

-import os
-
-from .core import DMatrix, DeviceQuantileDMatrix, Booster, DataIter, build_info
+from .core import (
+    DMatrix,
+    DeviceQuantileDMatrix,
+    Booster,
+    DataIter,
+    build_info,
+    _py_version,
+)
 from .training import train, cv
 from . import rabit  # noqa
 from . import tracker  # noqa
@@ -21,9 +25,9 @@ try:
 except ImportError:
    pass

-VERSION_FILE = os.path.join(os.path.dirname(__file__), "VERSION")
-with open(VERSION_FILE, encoding="ascii") as f:
-    __version__ = f.read().strip()
+
+__version__ = _py_version()
+

 __all__ = [
    # core
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -1,5 +1,5 @@
 # coding: utf-8
-# pylint: disable=invalid-name, too-many-statements, no-self-use
+# pylint: disable=invalid-name, too-many-statements
 # pylint: disable=too-many-arguments
 """Callback library containing training routines.  See :doc:`Callback Functions
 </python/callbacks>` for a quick introduction.
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -3,6 +3,7 @@
 """Core XGBoost Library."""
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
+import copy
 from typing import List, Optional, Any, Union, Dict, TypeVar
 from typing import Callable, Tuple, cast, Sequence, Type, Iterable
 import ctypes
@@ -136,6 +137,22 @@ def _get_log_callback_func() -> Callable:
    return c_callback(_log_callback)


+def _lib_version(lib: ctypes.CDLL) -> Tuple[int, int, int]:
+    """Get the XGBoost version from native shared object."""
+    major = ctypes.c_int()
+    minor = ctypes.c_int()
+    patch = ctypes.c_int()
+    lib.XGBoostVersion(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch))
+    return major.value, minor.value, patch.value
+
+
+def _py_version() -> str:
+    """Get the XGBoost version from Python version file."""
+    VERSION_FILE = os.path.join(os.path.dirname(__file__), "VERSION")
+    with open(VERSION_FILE, encoding="ascii") as f:
+        return f.read().strip()
+
+
 def _load_lib() -> ctypes.CDLL:
    """Load xgboost Library."""
    lib_paths = find_lib_path()
@@ -143,7 +160,7 @@ def _load_lib() -> ctypes.CDLL:
        # This happens only when building document.
        return None  # type: ignore
    try:
-        pathBackup = os.environ['PATH'].split(os.pathsep)
+        pathBackup = os.environ["PATH"].split(os.pathsep)
    except KeyError:
        pathBackup = []
    lib_success = False
@@ -152,15 +169,16 @@ def _load_lib() -> ctypes.CDLL:
        try:
            # needed when the lib is linked with non-system-available
            # dependencies
-            os.environ['PATH'] = os.pathsep.join(
-                pathBackup + [os.path.dirname(lib_path)])
+            os.environ["PATH"] = os.pathsep.join(
+                pathBackup + [os.path.dirname(lib_path)]
+            )
            lib = ctypes.cdll.LoadLibrary(lib_path)
            lib_success = True
        except OSError as e:
            os_error_list.append(str(e))
            continue
        finally:
-            os.environ['PATH'] = os.pathsep.join(pathBackup)
+            os.environ["PATH"] = os.pathsep.join(pathBackup)
    if not lib_success:
        libname = os.path.basename(lib_paths[0])
        raise XGBoostError(
@@ -176,11 +194,36 @@ Likely causes:
  * You are running 32-bit Python on a 64-bit OS

 Error message(s): {os_error_list}
-""")
+"""
+        )
    lib.XGBGetLastError.restype = ctypes.c_char_p
    lib.callback = _get_log_callback_func()  # type: ignore
    if lib.XGBRegisterLogCallback(lib.callback) != 0:
        raise XGBoostError(lib.XGBGetLastError())
+
+    def parse(ver: str) -> Tuple[int, int, int]:
+        """Avoid dependency on packaging (PEP 440)."""
+        # 2.0.0-dev or 2.0.0
+        major, minor, patch = ver.split("-")[0].split(".")
+        return int(major), int(minor), int(patch)
+
+    libver = _lib_version(lib)
+    pyver = parse(_py_version())
+
+    # verify that we are loading the correct binary.
+    if pyver != libver:
+        pyver_str = ".".join((str(v) for v in pyver))
+        libver_str = ".".join((str(v) for v in libver))
+        msg = (
+            "Mismatched version between the Python package and the native shared "
+            f"""object.  Python package version: {pyver_str}. Shared object """
+            f"""version: {libver_str}. Shared object is loaded from: {lib.path}.
+Likely cause:
+  * XGBoost is first installed with anaconda then upgraded with pip. To fix it """
+            "please remove one of the installations."
+        )
+        raise ValueError(msg)
+
    return lib


@@ -1402,10 +1445,12 @@ class Booster:
        self.set_param(params_processed or {})

    def _transform_monotone_constrains(
-        self, value: Union[Dict[str, int], str]
+        self, value: Union[Dict[str, int], str, Tuple[int, ...]]
    ) -> Union[Tuple[int, ...], str]:
        if isinstance(value, str):
            return value
+        if isinstance(value, tuple):
+            return value

        constrained_features = set(value.keys())
        feature_names = self.feature_names or []
@@ -1577,7 +1622,7 @@ class Booster:
        booster: `Booster`
            a copied booster model
        """
-        return self.__copy__()
+        return copy.copy(self)

    def attr(self, key: str) -> Optional[str]:
        """Get attribute string from the Booster.
@@ -2309,15 +2354,15 @@ class Booster:
        ret = self.get_dump(fmap, with_stats, dump_format)
        if dump_format == 'json':
            fout_obj.write('[\n')
-            for i, _ in enumerate(ret):
-                fout_obj.write(ret[i])
+            for i, val in enumerate(ret):
+                fout_obj.write(val)
                if i < len(ret) - 1:
                    fout_obj.write(",\n")
            fout_obj.write('\n]')
        else:
-            for i, _ in enumerate(ret):
+            for i, val in enumerate(ret):
                fout_obj.write(f"booster[{i}]:\n")
-                fout_obj.write(ret[i])
+                fout_obj.write(val)
        if need_close:
            fout_obj.close()

@@ -2604,8 +2649,8 @@ class Booster:
        values = []
        # pylint: disable=consider-using-f-string
        regexp = re.compile(r"\[{0}<([\d.Ee+-]+)\]".format(feature))
-        for i, _ in enumerate(xgdump):
-            m = re.findall(regexp, xgdump[i])
+        for i, val in enumerate(xgdump):
+            m = re.findall(regexp, val)
            values.extend([float(x) for x in m])

        n_unique = len(np.unique(values))
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -177,9 +177,11 @@ def _try_start_tracker(
                use_logger=False,
            )
        else:
-            assert isinstance(addrs[0], str) or addrs[0] is None
+            addr = addrs[0]
+            assert isinstance(addr, str) or addr is None
+            host_ip = get_host_ip(addr)
            rabit_context = RabitTracker(
-                host_ip=get_host_ip(addrs[0]), n_workers=n_workers, use_logger=False
+                host_ip=host_ip, n_workers=n_workers, use_logger=False, sortby="task"
            )
        env.update(rabit_context.worker_envs())
        rabit_context.start(n_workers)
@@ -229,8 +231,16 @@ class RabitContext:
    def __init__(self, args: List[bytes]) -> None:
        self.args = args
        worker = distributed.get_worker()
+        with distributed.worker_client() as client:
+            info = client.scheduler_info()
+            w = info["workers"][worker.address]
+            wid = w["id"]
+        # We use the task ID for rank assignment, which keeps the RABIT rank consistent
+        # with the dask worker ID (though not identical: task IDs are strings, so "10"
+        # sorts before "2"). This outsources rank assignment to dask and prevents
+        # non-deterministic rank assignment.
        self.args.append(
-            ("DMLC_TASK_ID=[xgboost.dask]:" + str(worker.address)).encode()
+            (f"DMLC_TASK_ID=[xgboost.dask-{wid}]:" + str(worker.address)).encode()
        )

    def __enter__(self) -> None:
@@ -870,6 +880,8 @@ async def _get_rabit_args(
    except Exception:  # pylint: disable=broad-except
        sched_addr = None

+    # make sure all workers are online so that we can obtain reliable scheduler_info
+    client.wait_for_workers(n_workers)
    env = await client.run_on_scheduler(
        _start_tracker, n_workers, sched_addr, user_addr
    )
@@ -1721,7 +1733,7 @@ class DaskScikitLearnBase(XGBModel):
    """Implementation of the Scikit-Learn API for XGBoost.""", ["estimators", "model"]
 )
 class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
-    # pylint: disable=missing-class-docstring
+    """dummy doc string to workaround pylint, replaced by the decorator."""
    async def _fit_async(
        self,
        X: _DaskCollection,
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -2,7 +2,6 @@
 # pylint: disable=too-many-return-statements, import-error
 '''Data dispatching for DMatrix.'''
 import ctypes
-from distutils import version
 import json
 import warnings
 import os
@@ -506,7 +505,6 @@ def record_batch_data_iter(data_iter: Iterator) -> Callable:
    """Data iterator used to ingest Arrow columnar record batches. We are not using
    class DataIter because it is only intended for building Device DMatrix and external
    memory DMatrix.
-
    """
    from pyarrow.cffi import ffi

@@ -557,13 +555,7 @@ def _from_arrow(
    if enable_categorical:
        raise ValueError("categorical data in arrow is not supported yet.")

-    major, _, _ = version.StrictVersion(pa.__version__).version
-    if major == 4:
-        rb_iter = iter(data.to_batches())
-    else:
-        # use_async=True to workaround pyarrow 6.0.1 hang,
-        # see Modin-3982 and ARROW-15362
-        rb_iter = iter(data.to_batches(use_async=True))
+    rb_iter = iter(data.to_batches())
    it = record_batch_data_iter(rb_iter)
    next_callback = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)(it)
    handle = ctypes.c_void_p()
@@ -714,9 +706,10 @@ def _is_cudf_ser(data):
    return isinstance(data, cudf.Series)


-def _is_cupy_array(data: Any) -> bool:
-    return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
-        data, "cupy._core.core", "ndarray"
+def _is_cupy_array(data) -> bool:
+    return any(
+        lazy_isinstance(data, n, "ndarray")
+        for n in ("cupy.core.core", "cupy", "cupy._core.core")
    )


--- a/python-package/xgboost/tracker.py
+++ b/python-package/xgboost/tracker.py
@@ -32,15 +32,15 @@ class ExSocket:
            chunk = self.sock.recv(min(nbytes - nread, 1024))
            nread += len(chunk)
            res.append(chunk)
-        return b''.join(res)
+        return b"".join(res)

    def recvint(self) -> int:
        """Receive an integer of 32 bytes"""
-        return struct.unpack('@i', self.recvall(4))[0]
+        return struct.unpack("@i", self.recvall(4))[0]

    def sendint(self, value: int) -> None:
        """Send an integer of 32 bytes"""
-        self.sock.sendall(struct.pack('@i', value))
+        self.sock.sendall(struct.pack("@i", value))

    def sendstr(self, value: str) -> None:
        """Send a Python string"""
@@ -69,6 +69,7 @@ def get_family(addr: str) -> int:

 class WorkerEntry:
    """Hanlder to each worker."""
+
    def __init__(self, sock: socket.socket, s_addr: Tuple[str, int]):
        worker = ExSocket(sock)
        self.sock = worker
@@ -78,7 +79,7 @@ class WorkerEntry:
        worker.sendint(MAGIC_NUM)
        self.rank = worker.recvint()
        self.world_size = worker.recvint()
-        self.jobid = worker.recvstr()
+        self.task_id = worker.recvstr()
        self.cmd = worker.recvstr()
        self.wait_accept = 0
        self.port: Optional[int] = None
@@ -96,8 +97,8 @@ class WorkerEntry:
        """Get the rank of current entry."""
        if self.rank >= 0:
            return self.rank
-        if self.jobid != 'NULL' and self.jobid in job_map:
-            return job_map[self.jobid]
+        if self.task_id != "NULL" and self.task_id in job_map:
+            return job_map[self.task_id]
        return -1

    def assign_rank(
@@ -180,7 +181,12 @@ class RabitTracker:
    """

    def __init__(
-        self, host_ip: str, n_workers: int, port: int = 0, use_logger: bool = False
+        self,
+        host_ip: str,
+        n_workers: int,
+        port: int = 0,
+        use_logger: bool = False,
+        sortby: str = "host",
    ) -> None:
        """A Python implementation of RABIT tracker.

@@ -190,6 +196,13 @@ class RabitTracker:
            Use logging.info for tracker print command.  When set to False, Python print
            function is used instead.

+        sortby:
+            How to sort the workers for rank assignment. The default is host, but users
+            can set the `DMLC_TASK_ID` via RABIT initialization arguments and obtain
+            deterministic rank assignment. Available options are:
+              - host
+              - task
+
        """
        sock = socket.socket(get_family(host_ip), socket.SOCK_STREAM)
        sock.bind((host_ip, port))
@@ -200,6 +213,7 @@ class RabitTracker:
        self.thread: Optional[Thread] = None
        self.n_workers = n_workers
        self._use_logger = use_logger
+        self._sortby = sortby
        logging.info("start listen on %s:%d", host_ip, self.port)

    def __del__(self) -> None:
@@ -223,7 +237,7 @@ class RabitTracker:
        get environment variables for workers
        can be passed in as args or envs
        """
-        return {'DMLC_TRACKER_URI': self.host_ip, 'DMLC_TRACKER_PORT': self.port}
+        return {"DMLC_TRACKER_URI": self.host_ip, "DMLC_TRACKER_PORT": self.port}

    def _get_tree(self, n_workers: int) -> Tuple[_TreeMap, Dict[int, int]]:
        tree_map: _TreeMap = {}
@@ -296,8 +310,16 @@ class RabitTracker:
                parent_map_[rmap[k]] = -1
        return tree_map_, parent_map_, ring_map_

+    def _sort_pending(self, pending: List[WorkerEntry]) -> List[WorkerEntry]:
+        if self._sortby == "host":
+            pending.sort(key=lambda s: s.host)
+        elif self._sortby == "task":
+            pending.sort(key=lambda s: s.task_id)
+        return pending
+
    def accept_workers(self, n_workers: int) -> None:
        """Wait for all workers to connect to the tracker."""
+
        # set of nodes that finishes the job
        shutdown: Dict[int, WorkerEntry] = {}
        # set of nodes that is waiting for connections
@@ -341,27 +363,32 @@ class RabitTracker:
                assert todo_nodes
                pending.append(s)
                if len(pending) == len(todo_nodes):
-                    pending.sort(key=lambda x: x.host)
+                    pending = self._sort_pending(pending)
                    for s in pending:
                        rank = todo_nodes.pop(0)
-                        if s.jobid != 'NULL':
-                            job_map[s.jobid] = rank
+                        if s.task_id != "NULL":
+                            job_map[s.task_id] = rank
                        s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
                        if s.wait_accept > 0:
                            wait_conn[rank] = s
-                        logging.debug('Received %s signal from %s; assign rank %d',
-                                      s.cmd, s.host, s.rank)
+                        logging.debug(
+                            "Received %s signal from %s; assign rank %d",
+                            s.cmd,
+                            s.host,
+                            s.rank,
+                        )
                if not todo_nodes:
-                    logging.info('@tracker All of %d nodes getting started', n_workers)
+                    logging.info("@tracker All of %d nodes getting started", n_workers)
            else:
                s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
-                logging.debug('Received %s signal from %d', s.cmd, s.rank)
+                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                if s.wait_accept > 0:
                    wait_conn[rank] = s
-        logging.info('@tracker All nodes finishes job')
+        logging.info("@tracker All nodes finishes job")

    def start(self, n_workers: int) -> None:
        """Start the tracker, it will wait for `n_workers` to connect."""
+
        def run() -> None:
            self.accept_workers(n_workers)

--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -193,9 +193,7 @@ XGB_DLL int XGBGetGlobalConfig(const char** json_str) {
  API_END();
 }

-XGB_DLL int XGDMatrixCreateFromFile(const char *fname,
-                                    int silent,
-                                    DMatrixHandle *out) {
+XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle *out) {
  API_BEGIN();
  bool load_row_split = false;
  if (rabit::IsDistributed()) {
--- a/src/c_api/c_api_utils.h
+++ b/src/c_api/c_api_utils.h
@@ -115,7 +115,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
  }
  }
  CHECK_EQ(
-      std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies<>{}),
+      std::accumulate(shape.cbegin(), shape.cend(), static_cast<bst_ulong>(1), std::multiplies<>{}),
      chunksize * rows);
 }

--- a/src/common/device_helpers.cu
+++ b/src/common/device_helpers.cu
@@ -38,6 +38,9 @@ void AllReducer::Init(int _device_ordinal) {

  int32_t const rank = rabit::GetRank();
  int32_t const world = rabit::GetWorldSize();
+  if (world == 1) {
+    return;
+  }

  std::vector<uint64_t> uuids(world * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -775,13 +775,16 @@ class AllReducer {
   */

  void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
 #ifdef XGBOOST_USE_NCCL
    CHECK(initialised_);
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm_, stream_));
    allreduce_bytes_ += count * sizeof(double);
    allreduce_calls_ += 1;
-#endif
+#endif  // XGBOOST_USE_NCCL
  }

  /**
@@ -796,9 +799,12 @@ class AllReducer {

  void AllGather(uint32_t const* data, size_t length,
                 dh::caching_device_vector<uint32_t>* recvbuf) {
+    size_t world = rabit::GetWorldSize();
+    if (world == 1) {
+      return;
+    }
 #ifdef XGBOOST_USE_NCCL
    CHECK(initialised_);
-    size_t world = rabit::GetWorldSize();
    recvbuf->resize(length * world);
    safe_nccl(ncclAllGather(data, recvbuf->data().get(), length, ncclUint32,
                            comm_, stream_));
@@ -813,9 +819,11 @@ class AllReducer {
   * \param recvbuff                The recvbuff.
   * \param count                   Number of elements.
   */
-
  void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm_, stream_));
@@ -836,6 +844,9 @@ class AllReducer {

  void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
@@ -845,6 +856,9 @@ class AllReducer {

  void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
@@ -853,6 +867,9 @@ class AllReducer {
  }

  void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
 #ifdef XGBOOST_USE_NCCL
    CHECK(initialised_);

@@ -867,12 +884,15 @@ class AllReducer {
            std::enable_if_t<std::is_same<size_t, T>::value &&
                             !std::is_same<size_t, unsigned long long>::value>  // NOLINT
                * = nullptr>
-  void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
+  void AllReduceSum(const T *sendbuff, T *recvbuff, int count) {  // NOLINT
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");  // NOLINT
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
 #endif
  }
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -184,8 +184,6 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
  dh::safe_cuda(cudaSetDevice(device));
  info.weights_.SetDevice(device);
  auto weights = info.weights_.ConstDeviceSpan();
-  dh::caching_device_vector<bst_group_t> group_ptr(info.group_ptr_);
-  auto d_group_ptr = dh::ToSpan(group_ptr);

  auto batch_iter = dh::MakeTransformIterator<data::COOTuple>(
    thrust::make_counting_iterator(0llu),
@@ -205,9 +203,13 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
  auto d_temp_weights = dh::ToSpan(temp_weights);

  if (is_ranking) {
+    if (!weights.empty()) {
+      CHECK_EQ(weights.size(), info.group_ptr_.size() - 1);
+    }
+    dh::caching_device_vector<bst_group_t> group_ptr(info.group_ptr_);
+    auto d_group_ptr = dh::ToSpan(group_ptr);
    auto const weight_iter = dh::MakeTransformIterator<float>(
-        thrust::make_constant_iterator(0lu),
-        [=]__device__(size_t idx) -> float {
+        thrust::make_counting_iterator(0lu), [=] __device__(size_t idx) -> float {
          auto ridx = batch.GetElement(idx).row_idx;
          bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
          return weights[group_idx];
@@ -272,7 +274,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
  size_t num_cols = batch.NumCols();
  size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
  int32_t device = sketch_container->DeviceIdx();
-  bool weighted = info.weights_.Size() != 0;
+  bool weighted = !info.weights_.Empty();

  if (weighted) {
    sketch_batch_num_elements = detail::SketchBatchNumElements(
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -122,27 +122,6 @@ std::vector<float> MergeWeights(MetaInfo const &info, Span<float const> hessian,
  }
  return results;
 }
-
-std::vector<float> UnrollGroupWeights(MetaInfo const &info) {
-  std::vector<float> const &group_weights = info.weights_.HostVector();
-  if (group_weights.empty()) {
-    return group_weights;
-  }
-
-  size_t n_samples = info.num_row_;
-  auto const &group_ptr = info.group_ptr_;
-  std::vector<float> results(n_samples);
-  CHECK_GE(group_ptr.size(), 2);
-  CHECK_EQ(group_ptr.back(), n_samples);
-  size_t cur_group = 0;
-  for (size_t i = 0; i < n_samples; ++i) {
-    results[i] = group_weights[cur_group];
-    if (i == group_ptr[cur_group + 1]) {
-      cur_group++;
-    }
-  }
-  return results;
-}
 }  // anonymous namespace

 template <typename WQSketch>
@@ -156,12 +135,10 @@ void SketchContainerImpl<WQSketch>::PushRowPage(SparsePage const &page, MetaInfo

  // glue these conditions using ternary operator to avoid making data copies.
  auto const &weights =
-      hessian.empty()
-          ? (use_group_ind_ ? UnrollGroupWeights(info)     // use group weight
-                            : info.weights_.HostVector())  // use sample weight
-          : MergeWeights(
-                info, hessian, use_group_ind_,
-                n_threads_);  // use hessian merged with group/sample weights
+      hessian.empty() ? (use_group_ind_ ? detail::UnrollGroupWeights(info)  // use group weight
+                                        : info.weights_.HostVector())       // use sample weight
+                      : MergeWeights(info, hessian, use_group_ind_,
+                                     n_threads_);  // use hessian merged with group/sample weights
  if (!weights.empty()) {
    CHECK_EQ(weights.size(), info.num_row_);
  }
@@ -563,8 +540,8 @@ void SortedSketchContainer::PushColPage(SparsePage const &page, MetaInfo const &
  monitor_.Start(__func__);
  // glue these conditions using ternary operator to avoid making data copies.
  auto const &weights =
-      hessian.empty() ? (use_group_ind_ ? UnrollGroupWeights(info)     // use group weight
-                                        : info.weights_.HostVector())  // use sample weight
+      hessian.empty() ? (use_group_ind_ ? detail::UnrollGroupWeights(info)  // use group weight
+                                        : info.weights_.HostVector())       // use sample weight
                      : MergeWeights(info, hessian, use_group_ind_,
                                     n_threads_);  // use hessian merged with group/sample weights
  CHECK_EQ(weights.size(), info.num_row_);
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -697,6 +697,29 @@ class WXQuantileSketch :
      public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> > {
 };

+namespace detail {
+/*!
+ * \brief Expand per-group weights into one weight per row.
+ *
+ * \param info Meta info whose `weights_` holds one weight per group and whose
+ *             `group_ptr_` holds the group boundaries (row offsets).
+ *
+ * \return A vector of `info.num_row_` weights where row i carries the weight of the
+ *         group g satisfying group_ptr[g] <= i < group_ptr[g + 1].  An empty vector
+ *         is returned when no weight is present.
+ */
+inline std::vector<float> UnrollGroupWeights(MetaInfo const &info) {
+  std::vector<float> const &group_weights = info.weights_.HostVector();
+  if (group_weights.empty()) {
+    return group_weights;
+  }
+
+  size_t n_samples = info.num_row_;
+  auto const &group_ptr = info.group_ptr_;
+  CHECK_GE(group_ptr.size(), 2);
+  CHECK_EQ(group_ptr.back(), n_samples);
+  // Exactly one weight per group is required, otherwise the lookup below would read
+  // past the end of the weight vector.
+  CHECK_EQ(group_weights.size(), group_ptr.size() - 1);
+
+  std::vector<float> results(n_samples);
+  size_t cur_group = 0;
+  for (size_t i = 0; i < n_samples; ++i) {
+    // Move to the group that owns row i *before* reading the weight, so the first row
+    // of a group does not inherit the previous group's weight.  The loop also skips
+    // empty groups.  It terminates in bounds because group_ptr.back() == n_samples > i.
+    while (i >= group_ptr[cur_group + 1]) {
+      ++cur_group;
+    }
+    results[i] = group_weights[cur_group];
+  }
+  return results;
+}
+}  // namespace detail
+
 class HistogramCuts;

 /*!
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -378,35 +378,6 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
  return out;
 }

-// try to load group information from file, if exists
-inline bool MetaTryLoadGroup(const std::string& fname,
-                             std::vector<unsigned>* group) {
-  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
-  if (fi == nullptr) return false;
-  dmlc::istream is(fi.get());
-  group->clear();
-  group->push_back(0);
-  unsigned nline = 0;
-  while (is >> nline) {
-    group->push_back(group->back() + nline);
-  }
-  return true;
-}
-
-// try to load weight information from file, if exists
-inline bool MetaTryLoadFloatInfo(const std::string& fname,
-                                 std::vector<bst_float>* data) {
-  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
-  if (fi == nullptr) return false;
-  dmlc::istream is(fi.get());
-  data->clear();
-  bst_float value;
-  while (is >> value) {
-    data->push_back(value);
-  }
-  return true;
-}
-
 namespace {
 template <int32_t D, typename T>
 void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
@@ -811,9 +782,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
  return nullptr;
 }

-DMatrix* DMatrix::Load(const std::string& uri,
-                       bool silent,
-                       bool load_row_split,
+DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
                       const std::string& file_format) {
  std::string fname, cache_file;
  size_t dlm_pos = uri.find('#');
@@ -846,50 +815,47 @@ DMatrix* DMatrix::Load(const std::string& uri,
  } else {
    fname = uri;
  }
+
+  // legacy handling of binary data loading
+  if (file_format == "auto") {
+    DMatrix* loaded = TryLoadBinary(fname, silent);
+    if (loaded) {
+      return loaded;
+    }
+  }
+
  int partid = 0, npart = 1;
  if (load_row_split) {
    partid = rabit::GetRank();
    npart = rabit::GetWorldSize();
  } else {
    // test option to load in part
-    npart = dmlc::GetEnv("XGBOOST_TEST_NPART", 1);
+    npart = 1;
  }

  if (npart != 1) {
-    LOG(CONSOLE) << "Load part of data " << partid
-                 << " of " << npart << " parts";
-  }
-
-  // legacy handling of binary data loading
-  if (file_format == "auto" && npart == 1) {
-    DMatrix *loaded = TryLoadBinary(fname, silent);
-    if (loaded) {
-      return loaded;
-    }
+    LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
  }

  DMatrix* dmat {nullptr};
  try {
    if (cache_file.empty()) {
      std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-          dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart,
-                                         file_format.c_str()));
+          dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
      data::FileAdapter adapter(parser.get());
-      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(),
-                             1, cache_file);
+      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1, cache_file);
    } else {
      data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
                              file_format};
-      dmat = new data::SparsePageDMatrix{
-          &iter,
-          iter.Proxy(),
-          data::fileiter::Reset,
-          data::fileiter::Next,
-          std::numeric_limits<float>::quiet_NaN(),
-          1,
-          cache_file};
+      dmat = new data::SparsePageDMatrix{&iter,
+                                         iter.Proxy(),
+                                         data::fileiter::Reset,
+                                         data::fileiter::Next,
+                                         std::numeric_limits<float>::quiet_NaN(),
+                                         1,
+                                         cache_file};
    }
-  } catch (dmlc::Error &e) {
+  } catch (dmlc::Error& e) {
    std::vector<std::string> splited = common::Split(fname, '#');
    std::vector<std::string> args = common::Split(splited.front(), '?');
    std::string format {file_format};
@@ -917,24 +883,6 @@ DMatrix* DMatrix::Load(const std::string& uri,
   * partitioned data will fail the train/val validation check
   * since partitioned data not knowing the real number of features. */
  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
-  // backward compatiblity code.
-  if (!load_row_split) {
-    MetaInfo& info = dmat->Info();
-    if (MetaTryLoadGroup(fname + ".group", &info.group_ptr_) && !silent) {
-      LOG(CONSOLE) << info.group_ptr_.size() - 1
-                   << " groups are loaded from " << fname << ".group";
-    }
-    if (MetaTryLoadFloatInfo(fname + ".base_margin", &info.base_margin_.Data()->HostVector()) &&
-        !silent) {
-      LOG(CONSOLE) << info.base_margin_.Size() << " base_margin are loaded from " << fname
-                   << ".base_margin";
-    }
-    if (MetaTryLoadFloatInfo
-        (fname + ".weight", &info.weights_.HostVector()) && !silent) {
-      LOG(CONSOLE) << info.weights_.Size()
-                   << " weights are loaded from " << fname << ".weight";
-    }
-  }
  return dmat;
 }
 template <typename DataIterHandle, typename DMatrixHandle,
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -130,12 +130,12 @@ void MetaInfo::SetInfoFromCUDA(Context const&, StringView key, Json array) {
  }
  // uint info
  if (key == "group") {
-    auto array_interface{ArrayInterface<1>(array)};
+    ArrayInterface<1> array_interface{array};
    CopyGroupInfoImpl(array_interface, &group_ptr_);
    data::ValidateQueryGroup(group_ptr_);
    return;
  } else if (key == "qid") {
-    auto array_interface{ArrayInterface<1>(array)};
+    ArrayInterface<1> array_interface{array};
    CopyQidImpl(array_interface, &group_ptr_);
    data::ValidateQueryGroup(group_ptr_);
    return;
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2018-2020 by Contributors
+ * Copyright 2018-2022 by Contributors
 * \file metric_common.h
 */
 #ifndef XGBOOST_METRIC_METRIC_COMMON_H_
@@ -9,6 +9,7 @@
 #include <string>

 #include "../common/common.h"
+#include "xgboost/metric.h"

 namespace xgboost {

--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -27,7 +27,7 @@ DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);

 /*! \brief Evaluate rank list on GPU */
 template <typename EvalMetricT>
-struct EvalRankGpu : public Metric, public EvalRankConfig {
+struct EvalRankGpu : public GPUMetric, public EvalRankConfig {
 public:
  double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
              bool distributed) override {
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -211,12 +211,13 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
      n_nodes = this->max_leaves * 2 - 1;
    } else {
      // bst_node_t will overflow.
-      CHECK_LE(this->max_depth, 31)
-          << "max_depth can not be greater than 31 as that might generate 2 ** "
-             "32 - 1 nodes.";
-      n_nodes = (1 << (this->max_depth + 1)) - 1;
+      // The two adjacent literals are concatenated by the compiler, so the first one
+      // must end with a space to keep the words of the message separated.
+      CHECK_LE(this->max_depth, 30)
+          << "max_depth can not be greater than 30 as that might generate 2^31 - 1 "
+             "nodes.";
+      // same as: (1 << (max_depth + 1)) - 1, but avoids 1 << 31, which overflows.
+      n_nodes = (1 << this->max_depth) + ((1 << this->max_depth) - 1);
    }
-    CHECK_NE(n_nodes, 0);
+    CHECK_GT(n_nodes, 0);
    return n_nodes;
  }
 };
--- a/tests/ci_build/Dockerfile.clang_tidy
+++ b/tests/ci_build/Dockerfile.clang_tidy
@@ -7,6 +7,7 @@ ENV DEBIAN_FRONTEND noninteractive

 # Install all basic requirements
 RUN \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
                       apt-transport-https ca-certificates gnupg-agent && \
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -10,13 +10,13 @@ RUN \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:ubuntu-toolchain-r/test && \
    apt-get update && \
-    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 && \
+    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
    # CMake
    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
    # Python
-    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    bash Miniconda3.sh -b -p /opt/python
+    wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
+    bash Mambaforge-Linux-x86_64.sh -b -p /opt/python

 ENV PATH=/opt/python/bin:$PATH
 ENV CC=gcc-8
@@ -24,10 +24,11 @@ ENV CXX=g++-8
 ENV CPP=cpp-8

 ENV GOSU_VERSION 1.10
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

 # Create new Conda environment
 COPY conda_env/cpu_test.yml /scripts/
-RUN conda env create -n cpu_test --file=/scripts/cpu_test.yml
+RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -8,8 +8,9 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell

 # Install all basic requirements
 RUN \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
    apt-get update && \
-    apt-get install -y wget unzip bzip2 libgomp1 build-essential && \
+    apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
    # Python
    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3.sh -b -p /opt/python
@@ -18,11 +19,14 @@ ENV PATH=/opt/python/bin:$PATH

 # Create new Conda environment with cuDF, Dask, and cuPy
 RUN \
-    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.8 cudf=21.10* rmm=21.10* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=21.10* dask-cudf=21.10* cupy=9.1* \
-        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
+    conda install -c conda-forge mamba && \
+    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+        python=3.8 cudf=22.04* rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=22.04* dask-cudf=22.04* cupy \
+        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
+        pyspark cloudpickle cuda-python=11.7.0

 ENV GOSU_VERSION 1.10
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
--- a/tests/ci_build/Dockerfile.gpu_build
+++ b/tests/ci_build/Dockerfile.gpu_build
@@ -8,6 +8,7 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell

 # Install all basic requirements
 RUN \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:ubuntu-toolchain-r/test && \
@@ -23,7 +24,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.5-1 && \
+    export NCCL_VERSION=2.13.4-1 && \
    apt-get update && \
    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}

--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -4,6 +4,8 @@ ARG CUDA_VERSION_ARG

 # Install all basic requirements
 RUN \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
+        > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
    yum install -y epel-release centos-release-scl && \
    yum-config-manager --enable centos-sclo-rh-testing && \
    yum -y update && \
@@ -19,7 +21,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.3-1 && \
+    export NCCL_VERSION=2.13.4-1 && \
    wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    yum -y update && \
--- a/tests/ci_build/Dockerfile.gpu_build_r_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7
@@ -4,6 +4,8 @@ ARG CUDA_VERSION_ARG

 # Install all basic requirements
 RUN \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
+        > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
    yum install -y epel-release centos-release-scl && \
    yum-config-manager --enable centos-sclo-rh-testing && \
    yum -y update && \
--- a/tests/ci_build/Dockerfile.gpu_jvm
+++ b/tests/ci_build/Dockerfile.gpu_jvm
@@ -9,6 +9,7 @@ ENV DEBIAN_FRONTEND noninteractive

 # Install all basic requirements
 RUN \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:openjdk-r/ppa && \
--- a/tests/ci_build/Dockerfile.jvm_gpu_build
+++ b/tests/ci_build/Dockerfile.jvm_gpu_build
@@ -4,6 +4,8 @@ ARG CUDA_VERSION_ARG

 # Install all basic requirements
 RUN \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
+        > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
    yum install -y epel-release centos-release-scl && \
    yum-config-manager --enable centos-sclo-rh-testing && \
    yum -y update && \
@@ -22,12 +24,10 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.8.3-1 && \
-    wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
-    rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
+    export NCCL_VERSION=2.13.4-1 && \
+    yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
    yum -y update && \
-    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
-    rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;
+    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}

 ENV PATH=/opt/python/bin:/opt/maven/bin:$PATH
 ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -8,19 +8,17 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell

 # Install all basic requirements
 RUN \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \
    # Python
    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    bash Miniconda3.sh -b -p /opt/python && \
-    # CMake
-    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
+    bash Miniconda3.sh -b -p /opt/python

 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.5-1 && \
+    export NCCL_VERSION=2.13.4-1 && \
    apt-get update && \
    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}

@@ -29,7 +27,7 @@ ENV PATH=/opt/python/bin:$PATH
 # Create new Conda environment with RMM
 RUN \
    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.8 rmm=21.10* cudatoolkit=$CUDA_VERSION_ARG
+        python=3.9 rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG cmake

 ENV GOSU_VERSION 1.10

--- a/tests/ci_build/build_python_wheels.sh
+++ b/tests/ci_build/build_python_wheels.sh
@@ -29,13 +29,15 @@ if [[ "$platform_id" == macosx_* ]]; then
        setup_env_var='CIBW_TARGET_OSX_ARM64=1'  # extra flag to be passed to setup.py
        export PYTHON_CROSSENV=1
        export MACOSX_DEPLOYMENT_TARGET=12.0
-        OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2"
+        #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2"
+        OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hf3c4609_1-osx-arm64.tar.bz2"
    elif [[ "$platform_id" == macosx_x86_64 ]]; then
        # MacOS, Intel
        wheel_tag=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64
        cpython_ver=37
        export MACOSX_DEPLOYMENT_TARGET=10.13
-        OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2"
+        #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2"
+        OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hda6cdc1_1-osx-64.tar.bz2"
    else
        echo "Platform not supported: $platform_id"
        exit 3
--- a/tests/ci_build/conda_env/cpu_test.yml
+++ b/tests/ci_build/conda_env/cpu_test.yml
@@ -30,14 +30,12 @@ dependencies:
 - jsonschema
 - boto3
 - awscli
- numba
- llvmlite
 - py-ubjson
 - cffi
 - pyarrow
+- protobuf
+- pyspark>=3.3.0
+- cloudpickle
+- shap
 - pip:
-  - shap
-  - ipython                     # required by shap at import time.
-  - sphinx_rtd_theme
  - datatable
-  - modin[all]
--- a/tests/ci_build/entrypoint.sh
+++ b/tests/ci_build/entrypoint.sh
@@ -20,9 +20,9 @@ else
 fi

 if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then
-    groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
+    groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
    useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
-        "${CI_BUILD_USER}"
+        "${CI_BUILD_USER}" || true
    export HOME="/home/${CI_BUILD_USER}"
    shopt -s dotglob
    cp -r /root/* "$HOME/"
--- a/tests/ci_build/rename_whl.py
+++ b/tests/ci_build/rename_whl.py
@@ -42,4 +42,4 @@ with cd(dirname):

    filesize = os.path.getsize(new_name) / 1024 / 1024  # MB
    msg = f"Limit of wheel size set by PyPI is exceeded. {new_name}: {filesize}"
-    assert filesize <= 200, msg
+    assert filesize <= 300, msg
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -381,6 +381,7 @@ void TestSketchFromWeights(bool with_group) {
  ValidateCuts(cuts, m.get(), kBins);

  if (with_group) {
+    m->Info().weights_ = decltype(m->Info().weights_)();  // remove weight
    HistogramCuts non_weighted = SketchOnDMatrix(m.get(), kBins, common::OmpGetNumThreads(0));
    for (size_t i = 0; i < cuts.Values().size(); ++i) {
      EXPECT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
@@ -392,6 +393,17 @@ void TestSketchFromWeights(bool with_group) {
      ASSERT_EQ(cuts.Ptrs().at(i), non_weighted.Ptrs().at(i));
    }
  }
+
+  if (with_group) {
+    auto& h_weights = info.weights_.HostVector();
+    h_weights.resize(kGroups);
+    // Generate different weight.
+    for (size_t i = 0; i < h_weights.size(); ++i) {
+      h_weights[i] = static_cast<float>(i + 1) / static_cast<float>(kGroups);
+    }
+    HistogramCuts weighted = SketchOnDMatrix(m.get(), kBins, common::OmpGetNumThreads(0));
+    ValidateCuts(weighted, m.get(), kBins);
+  }
 }

 TEST(HistUtil, SketchFromWeights) {
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -593,9 +593,10 @@ void TestAdapterSketchFromWeights(bool with_group) {
  ValidateCuts(cuts, dmat.get(), kBins);

  if (with_group) {
+    dmat->Info().weights_ = decltype(dmat->Info().weights_)();  // remove weight
    HistogramCuts non_weighted = DeviceSketch(0, dmat.get(), kBins, 0);
    for (size_t i = 0; i < cuts.Values().size(); ++i) {
-      EXPECT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
+      ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
    }
    for (size_t i = 0; i < cuts.MinValues().size(); ++i) {
      ASSERT_EQ(cuts.MinValues()[i], non_weighted.MinValues()[i]);
@@ -604,6 +605,24 @@ void TestAdapterSketchFromWeights(bool with_group) {
      ASSERT_EQ(cuts.Ptrs().at(i), non_weighted.Ptrs().at(i));
    }
  }
+
+  if (with_group) {
+    common::HistogramCuts weighted;
+    auto& h_weights = info.weights_.HostVector();
+    h_weights.resize(kGroups);
+    // Generate different weight.
+    for (size_t i = 0; i < h_weights.size(); ++i) {
+      // FIXME(jiamingy): Some entries generated in the GPU test cannot pass ValidateCuts
+      // if we use more diverse weights; this is partially caused by
+      // https://github.com/dmlc/xgboost/issues/7946
+      h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
+    }
+    SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
+    AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
+                        &sketch_container);
+    sketch_container.MakeCuts(&weighted);
+    ValidateCuts(weighted, dmat.get(), kBins);
+  }
 }

 TEST(HistUtil, AdapterSketchFromWeights) {
--- a/tests/cpp/common/test_hist_util.h
+++ b/tests/cpp/common/test_hist_util.h
@@ -98,7 +98,11 @@ inline void TestBinDistribution(const HistogramCuts &cuts, int column_idx,
                                int num_bins) {
  std::map<int, int> bin_weights;
  for (auto i = 0ull; i < sorted_column.size(); i++) {
-    bin_weights[cuts.SearchBin(sorted_column[i], column_idx)] += sorted_weights[i];
+    auto bin_idx = cuts.SearchBin(sorted_column[i], column_idx);
+    if (bin_weights.find(bin_idx) == bin_weights.cend()) {
+      bin_weights[bin_idx] = 0;
+    }
+    bin_weights.at(bin_idx) += sorted_weights[i];
  }
  int local_num_bins = cuts.Ptrs()[column_idx + 1] - cuts.Ptrs()[column_idx];
  auto total_weight = std::accumulate(sorted_weights.begin(), sorted_weights.end(),0);
@@ -176,8 +180,7 @@ inline void ValidateColumn(const HistogramCuts& cuts, int column_idx,
  }
 }

-inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
-                         int num_bins) {
+inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat, int num_bins) {
  // Collect data into columns
  std::vector<std::vector<float>> columns(dmat->Info().num_col_);
  for (auto& batch : dmat->GetBatches<SparsePage>()) {
@@ -189,17 +192,22 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
      }
    }
  }
+
+  // construct weights.
+  std::vector<float> w = dmat->Info().group_ptr_.empty() ? dmat->Info().weights_.HostVector()
+                                                         : detail::UnrollGroupWeights(dmat->Info());
+
  // Sort
  for (auto i = 0ull; i < columns.size(); i++) {
    auto& col = columns.at(i);
-    const auto& w = dmat->Info().weights_.HostVector();
-    std::vector<size_t > index(col.size());
+    std::vector<size_t> index(col.size());
    std::iota(index.begin(), index.end(), 0);
-    std::sort(index.begin(), index.end(),
-              [=](size_t a, size_t b) { return col[a] < col[b]; });
+    std::sort(index.begin(), index.end(), [=](size_t a, size_t b) { return col[a] < col[b]; });

    std::vector<float> sorted_column(col.size());
    std::vector<float> sorted_weights(col.size(), 1.0);
+    const auto& w = dmat->Info().weights_.HostVector();
+
    for (auto j = 0ull; j < col.size(); j++) {
      sorted_column[j] = col[index[j]];
      if (w.size() == col.size()) {
--- a/tests/cpp/common/test_span.cu
+++ b/tests/cpp/common/test_span.cu
@@ -252,7 +252,7 @@ __global__ void TestLastStaticKernel(Span<float> _span) {
  _span.last(static_cast<Span<float>::index_type>(-1));
 }

-TEST(GPUSpan, FirstLast) {
+TEST(GPUSpanDeathTest, FirstLast) {
  // We construct vectors multiple times since thrust can not recover from
  // death test.
  auto lambda_first_dy = []() {
@@ -312,40 +312,37 @@ TEST(GPUSpan, FirstLast) {
  output = testing::internal::GetCapturedStdout();
 }

-__global__ void TestFrontKernel(Span<float> _span)  {
-  _span.front();
-}
-
-__global__ void TestBackKernel(Span<float> _span)  {
-  _span.back();
-}
-
-TEST(GPUSpan, FrontBack) {
-  dh::safe_cuda(cudaSetDevice(0));
-
+namespace {
+void TestFrontBack() {
  Span<float> s;
-  auto lambda_test_front = [=]() {
-    // make sure the termination happens inside this test.
-    try {
-      TestFrontKernel<<<1, 1>>>(s);
-      dh::safe_cuda(cudaDeviceSynchronize());
-      dh::safe_cuda(cudaGetLastError());
-    } catch (dmlc::Error const& e) {
-      std::terminate();
-    }
-  };
-  EXPECT_DEATH(lambda_test_front(), "");
+  EXPECT_DEATH(
+      {
+        // make sure the termination happens inside this test.
+        try {
+          dh::LaunchN(1, [=] __device__(size_t) { s.front(); });
+          dh::safe_cuda(cudaDeviceSynchronize());
+          dh::safe_cuda(cudaGetLastError());
+        } catch (dmlc::Error const& e) {
+          std::terminate();
+        }
+      },
+      "");
+  EXPECT_DEATH(
+      {
+        try {
+          dh::LaunchN(1, [=] __device__(size_t) { s.back(); });
+          dh::safe_cuda(cudaDeviceSynchronize());
+          dh::safe_cuda(cudaGetLastError());
+        } catch (dmlc::Error const& e) {
+          std::terminate();
+        }
+      },
+      "");
+}
+}  // namespace

-  auto lambda_test_back = [=]() {
-    try {
-      TestBackKernel<<<1, 1>>>(s);
-      dh::safe_cuda(cudaDeviceSynchronize());
-      dh::safe_cuda(cudaGetLastError());
-    } catch (dmlc::Error const& e) {
-      std::terminate();
-    }
-  };
-  EXPECT_DEATH(lambda_test_back(), "");
+TEST(GPUSpanDeathTest, FrontBack) {
+  TestFrontBack();
 }

 __global__ void TestSubspanDynamicKernel(Span<float> _span) {
@@ -354,7 +351,7 @@ __global__ void TestSubspanDynamicKernel(Span<float> _span) {
 __global__ void TestSubspanStaticKernel(Span<float> _span) {
  _span.subspan<16>();
 }
-TEST(GPUSpan, Subspan) {
+TEST(GPUSpanDeathTest, Subspan) {
  auto lambda_subspan_dynamic = []() {
    thrust::host_vector<float> h_vec (4);
    InitializeRange(h_vec.begin(), h_vec.end());
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -82,7 +82,7 @@ def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:

    cp.testing.assert_allclose(single_node, predictions.compute())
    np.testing.assert_allclose(single_node,
-                               series_predictions.compute().to_array())
+                               series_predictions.compute().to_numpy())

    predt = dxgb.predict(client, out, X)
    assert isinstance(predt, dd.Series)
--- a/tests/python-gpu/test_large_input.py
+++ b/tests/python-gpu/test_large_input.py
@@ -18,4 +18,6 @@ def test_large_input():
    X = cp.ones((m, n), dtype=np.float32)
    y = cp.ones(m)
    dmat = xgb.DeviceQuantileDMatrix(X, y)
-    xgb.train({"tree_method": "gpu_hist", "max_depth": 1}, dmat, 1)
+    booster = xgb.train({"tree_method": "gpu_hist", "max_depth": 1}, dmat, 1)
+    del y
+    booster.inplace_predict(X)
--- a/tests/python/test_monotone_constraints.py
+++ b/tests/python/test_monotone_constraints.py
@@ -93,6 +93,11 @@ class TestMonotoneConstraints:
        constrained = xgb.train(params_for_constrained, training_dset)
        assert is_correctly_constrained(constrained)

+    def test_monotone_constraints_tuple(self) -> None:
+        params_for_constrained = {"monotone_constraints": (1, -1)}
+        constrained = xgb.train(params_for_constrained, training_dset)
+        assert is_correctly_constrained(constrained)
+
    @pytest.mark.parametrize('format', [dict, list])
    def test_monotone_constraints_feature_names(self, format):

--- a/tests/python/test_tracker.py
+++ b/tests/python/test_tracker.py
@@ -4,6 +4,7 @@ import pytest
 import testing as tm
 import numpy as np
 import sys
+import re

 if sys.platform.startswith("win"):
    pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -59,3 +60,34 @@ def test_rabit_ops():
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            run_rabit_ops(client, n_workers)
+
+
+def test_rank_assignment() -> None:
+    from distributed import Client, LocalCluster
+    from test_with_dask import _get_client_workers
+
+    def local_test(worker_id):
+        with xgb.dask.RabitContext(args):
+            for val in args:
+                sval = val.decode("utf-8")
+                if sval.startswith("DMLC_TASK_ID"):
+                    task_id = sval
+                    break
+            matched = re.search(".*-([0-9]).*", task_id)
+            rank = xgb.rabit.get_rank()
+            # As long as the number of workers is less than 10, rank and worker id
+            # should be the same
+            assert rank == int(matched.group(1))
+
+    with LocalCluster(n_workers=8) as cluster:
+        with Client(cluster) as client:
+            workers = _get_client_workers(client)
+            args = client.sync(
+                xgb.dask._get_rabit_args,
+                len(workers),
+                None,
+                client,
+            )
+
+            futures = client.map(local_test, range(len(workers)), workers=workers)
+            client.gather(futures)
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -10,10 +10,10 @@ exact_parameter_strategy = strategies.fixed_dictionaries({
    'nthread': strategies.integers(1, 4),
    'max_depth': strategies.integers(1, 11),
    'min_child_weight': strategies.floats(0.5, 2.0),
-    'alpha': strategies.floats(0.0, 2.0),
+    'alpha': strategies.floats(1e-5, 2.0),
    'lambda': strategies.floats(1e-5, 2.0),
    'eta': strategies.floats(0.01, 0.5),
-    'gamma': strategies.floats(0.0, 2.0),
+    'gamma': strategies.floats(1e-5, 2.0),
    'seed': strategies.integers(0, 10),
    # We cannot enable subsampling as the training loss can increase
    # 'subsample': strategies.floats(0.5, 1.0),
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -1,6 +1,7 @@
 """Copyright 2019-2022 XGBoost contributors"""
 from pathlib import Path
 import pickle
+import socket
 import testing as tm
 import pytest
 import xgboost as xgb
@@ -110,9 +111,10 @@ def make_categorical(


 def generate_array(
-        with_weights: bool = False
-) -> Tuple[xgb.dask._DaskCollection, xgb.dask._DaskCollection,
-           Optional[xgb.dask._DaskCollection]]:
+    with_weights: bool = False,
+) -> Tuple[
+    xgb.dask._DaskCollection, xgb.dask._DaskCollection, Optional[xgb.dask._DaskCollection]
+]:
    chunk_size = 20
    rng = da.random.RandomState(1994)
    X = rng.random_sample((kRows, kCols), chunks=(chunk_size, -1))
@@ -1189,6 +1191,50 @@ def test_dask_iteration_range(client: "Client"):


 class TestWithDask:
+    def test_dmatrix_binary(self, client: "Client") -> None:
+        def save_dmatrix(rabit_args: List[bytes], tmpdir: str) -> None:
+            with xgb.dask.RabitContext(rabit_args):
+                rank = xgb.rabit.get_rank()
+                X, y = tm.make_categorical(100, 4, 4, False)
+                Xy = xgb.DMatrix(X, y, enable_categorical=True)
+                path = os.path.join(tmpdir, f"{rank}.bin")
+                Xy.save_binary(path)
+
+        def load_dmatrix(rabit_args: List[bytes], tmpdir: str) -> None:
+            with xgb.dask.RabitContext(rabit_args):
+                rank = xgb.rabit.get_rank()
+                path = os.path.join(tmpdir, f"{rank}.bin")
+                Xy = xgb.DMatrix(path)
+                assert Xy.num_row() == 100
+                assert Xy.num_col() == 4
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            workers = _get_client_workers(client)
+            rabit_args = client.sync(
+                xgb.dask._get_rabit_args, len(workers), None, client
+            )
+            futures = []
+            for w in workers:
+                # same argument for each worker, must set pure to False otherwise dask
+                # will try to reuse the result from the first worker and hang waiting
+                # for it.
+                f = client.submit(
+                    save_dmatrix, rabit_args, tmpdir, workers=[w], pure=False
+                )
+                futures.append(f)
+            client.gather(futures)
+
+            rabit_args = client.sync(
+                xgb.dask._get_rabit_args, len(workers), None, client
+            )
+            futures = []
+            for w in workers:
+                f = client.submit(
+                    load_dmatrix, rabit_args, tmpdir, workers=[w], pure=False
+                )
+                futures.append(f)
+            client.gather(futures)
+
    @pytest.mark.parametrize('config_key,config_value', [('verbosity', 0), ('use_rmm', True)])
    def test_global_config(
            self,
@@ -1240,11 +1286,11 @@ class TestWithDask:
        os.remove(after_fname)

        with dask.config.set({'xgboost.foo': "bar"}):
-            with pytest.raises(ValueError):
+            with pytest.raises(ValueError, match=r"Unknown configuration.*"):
                xgb.dask.train(client, {}, dtrain, num_boost_round=4)

-        with dask.config.set({'xgboost.scheduler_address': "127.0.0.1:22"}):
-            with pytest.raises(PermissionError):
+        with dask.config.set({'xgboost.scheduler_address': "127.0.0.1:foo"}):
+            with pytest.raises(socket.gaierror, match=r".*not known.*"):
                xgb.dask.train(client, {}, dtrain, num_boost_round=1)

    def run_updater_test(
Author	SHA1	Message	Date
Jiaming Yuan	b9934246fa	Fix typo. (#8192 )	2022-08-22 16:36:38 +08:00
Jiaming Yuan	1fbb4524d2	Fix release script. (#8187 )	2022-08-22 01:07:54 +08:00
Jiaming Yuan	0fd6391a77	[backport] Fix loading DMatrix binary in distributed env. (#8149 ) (#8185 ) * Fix loading DMatrix binary in distributed env. (#8149) - Try to load DMatrix binary before trying to parse text input. - Remove some unmaintained code. * Fix.	2022-08-19 04:11:12 +08:00
Philip Hyunsu Cho	922d2137dd	[CI] Fix R build on Jenkins. (#8154 ) (#8180 ) Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>	2022-08-17 22:06:07 -07:00
Jiaming Yuan	7036d4f22b	Disable modin test on 1.6.0 branch. (#8176 )	2022-08-18 04:13:10 +08:00
Jiaming Yuan	2d54f7d58f	Make 1.6.2 patch release. (#8175 )	2022-08-16 14:38:15 +08:00
Jiaming Yuan	51c330159a	[backport] Fix LTR with weighted Quantile DMatrix. (#7975 ) (#8170 ) * Fix LTR with weighted Quantile DMatrix. * Better tests.	2022-08-15 17:50:16 +08:00
Jiaming Yuan	e82162d7f8	[backport] Fix Python package source install. (#8036 ) (#8171 ) * Copy gputreeshap.	2022-08-15 15:19:00 +08:00
Jiaming Yuan	b18c984035	[dask] Deterministic rank assignment. (#8018 ) (#8165 )	2022-08-15 15:18:26 +08:00
Jiaming Yuan	2e6444b342	[backport] Limit `max_depth` to 30 for GPU. (#8098 ) (#8169 )	2022-08-15 15:16:58 +08:00
Jiaming Yuan	0e2b5c467e	Verify shared object version at load. (#7928 ) (#8168 )	2022-08-15 15:16:22 +08:00
Jiaming Yuan	97d89c3ca1	[dask] Use an invalid port for test. (#8064 ) (#8167 )	2022-08-15 12:23:12 +08:00
Jiaming Yuan	9d816d9988	[CI] Test with latest RAPIDS. (#7816 ) (#8164 )	2022-08-13 01:06:52 +08:00
Jiaming Yuan	9c653378e2	Fix monotone constraint with tuple input. (#7891 ) (#8159 )	2022-08-12 22:05:53 +08:00
Jiaming Yuan	140c377a96	[backport] Fix compatibility with latest cupy. (#8129 ) (#8160 ) * Fix compatibility with latest cupy. * Freeze mypy.	2022-08-12 22:02:05 +08:00
Jiaming Yuan	39c1488a42	[backport] Update CUDA docker image and NCCL. (#8139 ) (#8162 ) * Update CUDA docker image and NCCL. (#8139) * Rest of the CI. * CPU test dependencies.	2022-08-12 18:57:42 +08:00
Jiaming Yuan	a55d3bdde2	[backport] Fix pylint errors. (#7967 ) (#7981 ) * Fix pylint errors. (#7967) * Rebase error.	2022-06-07 23:09:53 +08:00
Jiaming Yuan	5973c6e74e	Fix rmm build (#7973 ) (#7977 ) - Optionally switch to c++17 - Use rmm CMake target. - Workaround compiler errors. - Fix GPUMetric inheritance. - Run death tests even if it's built with RMM support. Co-authored-by: jakirkham <jakirkham@gmail.com> Co-authored-by: jakirkham <jakirkham@gmail.com>	2022-06-07 14:20:50 +08:00
Jiaming Yuan	b7c3fc9182	Fix overflow in prediction size. (#7885 ) (#7980 )	2022-06-07 12:30:41 +08:00
Jiaming Yuan	645855e8b1	[backport] Fix arrow compatibility, hypothesis tests. (#7979 )	2022-06-07 01:47:45 +08:00
Jiaming Yuan	eefa1ddd8a	[CI] Rotate package repository keys (#7943 ) (#7978 ) Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>	2022-06-07 00:00:54 +08:00
@@ -1 +1 @@
-1.6.1
+1.6.2