Compare commits

22 Commits
master-roc ... release_1.

| Author | SHA1 | Date |
|---|---|---|
|  | 522b8977c2 |  |
|  | 8147d78b12 |  |
|  | 651c4ac03b |  |
|  | 1fb75d2460 |  |
|  | 6609211517 |  |
|  | b78ad1e623 |  |
|  | 96f8843694 |  |
|  | a6d1fbf8d1 |  |
|  | a347ef7c66 |  |
|  | d05c47dcf8 |  |
|  | 9f5e2c52ce |  |
|  | c42387162d |  |
|  | 1220024442 |  |
|  | 964ee6b605 |  |
|  | 04fedefd4d |  |
|  | f814d4027a |  |
|  | 2cc37370e2 |  |
|  | c6a0bdbb5a |  |
|  | 357a78b3de |  |
|  | d231e7c35f |  |
|  | 604ae01b7a |  |
|  | 43f52ed33c |  |
.github/workflows/main.yml (7 changes, vendored)

@@ -31,7 +31,10 @@ jobs:
    - name: Run gtest binary
      run: |
        cd build
        ctest --exclude-regex AllTestsInDMLCUnitTests --extra-verbose
        # libomp internal error:
        # OMP: Error #131: Thread identifier invalid.
        ./testxgboost --gtest_filter="-HistIndexCreationWithExternalMemory.Test"
        ctest -R TestXGBoostCLI --extra-verbose

  gtest-cpu-nonomp:
    name: Test Google C++ unittest (CPU Non-OMP)

@@ -279,7 +282,7 @@ jobs:
      run: |
        cd build/
        tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
        python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/ --acl public-read
        python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
.github/workflows/r_tests.yml (2 changes, vendored)

@@ -100,7 +100,7 @@ jobs:
    - name: Install system packages
      run: |
        sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev
        sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc

    - name: Install dependencies
      shell: Rscript {0}
.travis.yml (12 changes)

@@ -19,18 +19,10 @@ jobs:
    env: TASK=java_test

  # dependent brew packages
  # the dependencies from homebrew is installed manually from setup script due to outdated image from travis.
  addons:
    homebrew:
      packages:
        - cmake
        - libomp
        - graphviz
        - openssl
        - libgit2
        - lz4
        - wget
        - r
      update: true
      update: false
    apt:
      packages:
        - snapd
CMakeLists.txt

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.13)
project(xgboost LANGUAGES CXX C VERSION 1.4.0)
project(xgboost LANGUAGES CXX C VERSION 1.4.2)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
Jenkinsfile (22 changes, vendored)

@@ -65,6 +65,7 @@ pipeline {
    'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
    'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) },
    'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') },
    'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '10.0') },
    'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') },
    'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') },
    'build-jvm-doc': { BuildJVMDoc() }

@@ -178,7 +179,7 @@ def BuildCPUARM64() {
    ${dockerRun} ${container_type} ${docker_binary} bash -c "cd build && ctest --extra-verbose"
    ${dockerRun} ${container_type} ${docker_binary} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
    ${dockerRun} ${container_type} ${docker_binary} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} ${wheel_tag}
    ${dockerRun} ${container_type} ${docker_binary} auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl
    ${dockerRun} ${container_type} ${docker_binary} bash -c "auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl && python tests/ci_build/rename_whl.py wheelhouse/*.whl ${commit_id} ${wheel_tag}"
    mv -v wheelhouse/*.whl python-package/dist/
    # Make sure that libgomp.so is vendored in the wheel
    ${dockerRun} ${container_type} ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"

@@ -230,6 +231,7 @@ def BuildCUDA(args) {
    if (args.cuda_version == ref_cuda_ver) {
      sh """
      ${dockerRun} auditwheel_x86_64 ${docker_binary} auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl
      ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py wheelhouse/*.whl ${commit_id} ${wheel_tag}
      mv -v wheelhouse/*.whl python-package/dist/
      # Make sure that libgomp.so is vendored in the wheel
      ${dockerRun} auditwheel_x86_64 ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"

@@ -264,6 +266,24 @@ def BuildCUDA(args) {
  }
}

def BuildRPackageWithCUDA(args) {
  node('linux && cpu_build') {
    unstash name: 'srcs'
    def container_type = 'gpu_build_r_centos6'
    def docker_binary = "docker"
    def docker_args = "--build-arg CUDA_VERSION_ARG=10.0"
    if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
      sh """
      ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_r_pkg_with_cuda.sh ${commit_id}
      """
      echo 'Uploading R tarball...'
      path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
      s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', includePathPattern:'xgboost_r_gpu_linux_*.tar.gz'
    }
    deleteDir()
  }
}

def BuildJVMPackagesWithCUDA(args) {
  node('linux && mgpu') {
    unstash name: 'srcs'
R-package/DESCRIPTION

@@ -1,7 +1,7 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 1.4.0.1
Version: 1.4.2.1
Date: 2020-08-28
Authors@R: c(
  person("Tianqi", "Chen", role = c("aut"),
@@ -1 +1 @@
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@-SNAPSHOT
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@
@@ -49,7 +49,7 @@ def download_wheels(
    dir_URL: str,
    src_filename_prefix: str,
    target_filename_prefix: str,
) -> None:
) -> List[str]:
    """Download all binary wheels.  dir_URL is the URL for remote directory storing the release
    wheels

@@ -72,6 +72,8 @@ def download_wheels(
    assert stderr.find("warning") == -1, "Unresolved warnings:\n" + stderr
    assert stdout.find("warning") == -1, "Unresolved warnings:\n" + stdout

    return filenames


def check_path():
    root = os.path.abspath(os.path.curdir)
@@ -1 +1 @@
Subproject commit 243d91587d080f97f416433b18c2454179df867c
Subproject commit f00e3ec7abc9f293a1b7061157b0a4e22a735cf5
doc/install.rst

@@ -2,18 +2,15 @@
Installation Guide
##################

.. note:: Pre-built binary wheel for Python
.. note:: Pre-built binary wheel for Python: now with GPU support

  If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, available from Python Package Index (PyPI). You may download and install it by running
  If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, to avoid the trouble of building XGBoost from the source. You may download and install it by running

  .. code-block:: bash

    # Ensure that you are downloading one of the following:
    #   * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
    #   * xgboost-{version}-py2.py3-none-win_amd64.whl
    pip3 install xgboost

  * The binary wheel will support GPU algorithms (`gpu_hist`) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
  * The binary wheel will support the GPU algorithm (``gpu_hist``) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
  * Currently, we provide binary wheels for 64-bit Linux, macOS and Windows.
  * Nightly builds are available. You can go to `this page
    <https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/list.html>`_, find the

@@ -23,6 +20,21 @@ Installation Guide

    pip install <url to the wheel>

.. note:: (EXPERIMENTAL) Pre-built binary package for R: now with GPU support

  If you are planning to use R, consider installing ``{xgboost}`` from a pre-built binary package, to avoid the trouble of building XGBoost from the source. The binary package will let you use the GPU algorithm (``gpu_hist``) out of the box, as long as your machine has NVIDIA GPUs.

  Download the binary package from the Releases page. The file name will be of the form ``xgboost_r_gpu_linux_[version].tar.gz``. Then install XGBoost by running:

  .. code-block:: bash

    # Install dependencies
    R -q -e "install.packages(c('data.table', 'magrittr', 'jsonlite', 'remotes'))"
    # Install XGBoost
    R CMD INSTALL ./xgboost_r_gpu_linux.tar.gz

  Currently, we provide the binary package for 64-bit Linux.

****************************
Building XGBoost from source
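Both the pre-built wheel and the R binary package described above ship the `gpu_hist` updater. A minimal Python sketch to confirm GPU training works, assuming the PyPI wheel and an NVIDIA GPU are present (the toy data is made up for illustration):

```python
import numpy as np
import xgboost as xgb

# Toy data; any numeric matrix works.
X = np.random.rand(1000, 10).astype(np.float32)
y = np.random.randint(2, size=1000)
dtrain = xgb.DMatrix(X, label=y)

# 'gpu_hist' is the GPU algorithm referenced in the installation guide above.
booster = xgb.train(
    {"tree_method": "gpu_hist", "objective": "binary:logistic"},
    dtrain,
    num_boost_round=10,
)
```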
include/xgboost/c_api.h

@@ -740,15 +740,17 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
 *
 * \param handle Booster handle
 * \param dmat DMatrix handle
 * \param c_json_config String encoded predict configuration in JSON format.
 * \param c_json_config String encoded predict configuration in JSON format, with
 *                      following available fields in the JSON object:
 *
 *    "type": [0, 5]
 *    "type": [0, 6]
 *      0: normal prediction
 *      1: output margin
 *      2: predict contribution
 *      3: predict approxmated contribution
 *      3: predict approximated contribution
 *      4: predict feature interaction
 *      5: predict leaf
 *      5: predict approximated feature interaction
 *      6: predict leaf
 *    "training": bool
 *      Whether the prediction function is used as part of a training loop.  **Not used
 *      for inplace prediction**.

@@ -764,7 +766,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
 *    "iteration_begin": int
 *      Beginning iteration of prediction.
 *    "iteration_end": int
 *      End iteration of prediction.  Set to 0 this will become the size of tree model.
 *      End iteration of prediction.  Set to 0 this will become the size of tree model (all the trees).
 *    "strict_shape": bool
 *      Whether should we reshape the output with stricter rules.  If set to true,
 *      normal/margin/contrib/interaction predict will output consistent shape

@@ -36,7 +36,8 @@ enum class PredictionType : std::uint8_t {  // NOLINT
  kContribution = 2,
  kApproxContribution = 3,
  kInteraction = 4,
  kLeaf = 5
  kApproxInteraction = 5,
  kLeaf = 6
};

/*! \brief entry to to easily hold returning information */
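For reference, a hedged sketch of the JSON payload those documented fields describe. Only the field names shown in the comment above are used; the ctypes plumbing that hands the string to a function such as XGBoosterPredictFromDMatrix (shown further down) is omitted:

```python
import json

# "type" follows the remapped codes above: 0 normal, 1 margin, 2 contribution,
# 3 approximated contribution, 4 interaction, 5 approximated interaction, 6 leaf.
config = json.dumps({
    "type": 6,               # predict leaf indices (was code 5 before this change)
    "training": False,
    "iteration_begin": 0,
    "iteration_end": 0,      # 0 means use the whole tree model (all the trees)
    "strict_shape": True,
})
```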
include/xgboost/version_config.h

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 1
#define XGBOOST_VER_MINOR 4
#define XGBOOST_VER_PATCH 0
#define XGBOOST_VER_PATCH 2

#endif  // XGBOOST_VERSION_CONFIG_H_
jvm-packages/create_jni.py

@@ -84,8 +84,9 @@ if __name__ == "__main__":

    print("building Java wrapper")
    with cd(".."):
        maybe_makedirs("build")
        with cd("build"):
        build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build'
        maybe_makedirs(build_dir)
        with cd(build_dir):
            if sys.platform == "win32":
                # Force x64 build on Windows.
                maybe_generator = ' -A x64'

@@ -114,6 +115,9 @@ if __name__ == "__main__":
            if gpu_arch_flag is not None:
                args.append("%s" % gpu_arch_flag)

            lib_dir = os.path.join(os.pardir, 'lib')
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
            run("cmake .. " + " ".join(args) + maybe_generator)
            run("cmake --build . --config Release" + maybe_parallel_build)
jvm-packages (POM files, version metadata and source symlinks)

All JVM modules move from <version>1.4.0-SNAPSHOT</version> to <version>1.4.2</version>: the parent xgboost-jvm_2.12 POM (packaging pom, "XGBoost JVM Package"), xgboost4j_2.12, xgboost4j-example_2.12, xgboost4j-flink_2.12, xgboost4j-gpu_2.12, xgboost4j-spark_2.12 and xgboost4j-spark-gpu_2.12. The bump applies to the <parent> blocks, to each module's own <version>, and to the cross-module <dependency> entries on xgboost4j_${scala.binary.version}, xgboost4j-spark_${scala.binary.version}, xgboost4j-flink_${scala.binary.version} and xgboost4j-gpu_${scala.binary.version}. The plain-text version file also changes from 1.4.0-SNAPSHOT to 1.4.2.

The single symbolic link pointing at ../xgboost4j/src/ is removed and replaced with finer-grained links under the GPU module:
jvm-packages/xgboost4j-gpu/src/main/java -> ../../../xgboost4j/src/main/java/
(plus a link to ../../../../xgboost4j/src/main/resources/xgboost4j-version.properties)
jvm-packages/xgboost4j-gpu/src/main/scala -> ../../../xgboost4j/src/main/scala/
jvm-packages/xgboost4j-gpu/src/native -> ../../xgboost4j/src/native
jvm-packages/xgboost4j-gpu/src/test -> ../../xgboost4j/src/test

Likewise the symbolic link pointing at ../xgboost4j-spark/src/ is removed and replaced with:
jvm-packages/xgboost4j-spark-gpu/src/main/scala -> ../../../xgboost4j-spark/src/main/scala
jvm-packages/xgboost4j-spark-gpu/src/test -> ../../xgboost4j-spark/src/test
python-package/xgboost/core.py

@@ -229,6 +229,9 @@ def _numpy2ctypes_type(dtype):


def _array_interface(data: np.ndarray) -> bytes:
    assert (
        data.dtype.hasobject is False
    ), "Input data contains `object` dtype.  Expecting numeric data."
    interface = data.__array_interface__
    if "mask" in interface:
        interface["mask"] = interface["mask"].__array_interface__

@@ -1648,7 +1651,9 @@ class Booster(object):
            prediction.  Note the final column is the bias term.

        approx_contribs :
            Approximate the contributions of each feature
            Approximate the contributions of each feature.  Used when ``pred_contribs`` or
            ``pred_interactions`` is set to True.  Changing the default of this parameter
            (False) is not recommended.

        pred_interactions :
            When this is True the output will be a matrix of size (nsample,

@@ -1720,9 +1725,9 @@ class Booster(object):
        if pred_contribs:
            assign_type(2 if not approx_contribs else 3)
        if pred_interactions:
            assign_type(4)
            assign_type(4 if not approx_contribs else 5)
        if pred_leaf:
            assign_type(5)
            assign_type(6)
        preds = ctypes.POINTER(ctypes.c_float)()
        shape = ctypes.POINTER(c_bst_ulong)()
        dims = c_bst_ulong()
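A short usage sketch of how those Python-level flags select the remapped type codes, reusing any trained Booster `booster` and DMatrix `dtrain` (for instance the ones from the earlier GPU example):

```python
contribs = booster.predict(dtrain, pred_contribs=True)                                # type 2
approx_contribs = booster.predict(dtrain, pred_contribs=True, approx_contribs=True)   # type 3
interactions = booster.predict(dtrain, pred_interactions=True)                        # type 4
approx_inter = booster.predict(dtrain, pred_interactions=True, approx_contribs=True)  # type 5
leaf_indices = booster.predict(dtrain, pred_leaf=True)                                # type 6
```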
@@ -1839,8 +1844,8 @@ class Booster(object):
        )

        if isinstance(data, np.ndarray):
            from .data import _maybe_np_slice
            data = _maybe_np_slice(data, data.dtype)
            from .data import _ensure_np_dtype
            data, _ = _ensure_np_dtype(data, data.dtype)
            _check_call(
                _LIB.XGBoosterPredictFromDense(
                    self.handle,

@@ -1870,7 +1875,9 @@ class Booster(object):
                )
            )
            return _prediction_output(shape, dims, preds, False)
        if lazy_isinstance(data, "cupy.core.core", "ndarray"):
        if lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
            data, "cupy._core.core", "ndarray"
        ):
            from .data import _transform_cupy_array
            data = _transform_cupy_array(data)
            interface = data.__cuda_array_interface__

@@ -2025,7 +2032,7 @@ class Booster(object):
        """
        if isinstance(fout, (STRING_TYPES, os.PathLike)):
            fout = os.fspath(os.path.expanduser(fout))
            fout = open(fout, 'w')
            fout = open(fout, 'w')  # pylint: disable=consider-using-with
            need_close = True
        else:
            need_close = False
python-package/xgboost/dask.py

@@ -21,6 +21,7 @@ from contextlib import contextmanager
from collections import defaultdict
from collections.abc import Sequence
from threading import Thread
from functools import partial, update_wrapper
from typing import TYPE_CHECKING, List, Tuple, Callable, Optional, Any, Union, Dict, Set
from typing import Awaitable, Generator, TypeVar

@@ -967,7 +968,7 @@ def _can_output_df(is_df: bool, output_shape: Tuple) -> bool:
    return is_df and len(output_shape) <= 2


async def _direct_predict_impl(
async def _direct_predict_impl(  # pylint: disable=too-many-branches
    mapped_predict: Callable,
    booster: "distributed.Future",
    data: _DaskCollection,

@@ -1022,6 +1023,14 @@ async def _direct_predict_impl(
        new_axis = list(range(len(output_shape) - 2))
    else:
        new_axis = [i + 2 for i in range(len(output_shape) - 2)]
    if len(output_shape) == 2:
        # Somehow dask fail to infer output shape change for 2-dim prediction, and
        # `chunks = (None, output_shape[1])` doesn't work due to None is not
        # supported in map_blocks.
        chunks = list(data.chunks)
        chunks[1] = (output_shape[1], )
    else:
        chunks = None
    predictions = da.map_blocks(
        mapped_predict,
        booster,

@@ -1029,6 +1038,8 @@ async def _direct_predict_impl(
        False,
        columns,
        base_margin_array,

        chunks=chunks,
        drop_axis=drop_axis,
        new_axis=new_axis,
        dtype=numpy.float32,
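A hedged dask sketch of the chunk bookkeeping the comment above describes: for a 2-dim output the second axis becomes a single chunk of the output width while the row chunking of the input is preserved. `fake_predict` and `n_classes` are illustrative stand-ins, not part of the patch:

```python
import numpy as np
import dask.array as da

n_classes = 3

def fake_predict(block):
    # Stand-in for the real per-partition prediction.
    return np.zeros((block.shape[0], n_classes), dtype=np.float32)

data = da.random.random((100, 10), chunks=(25, 10))
chunks = list(data.chunks)
chunks[1] = (n_classes,)           # None is not accepted by map_blocks
preds = da.map_blocks(fake_predict, data, chunks=chunks, dtype=np.float32)
```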
@@ -1776,20 +1787,20 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
        self,
        X: _DaskCollection,
        validate_features: bool,
        output_margin: bool,
        base_margin: Optional[_DaskCollection],
        iteration_range: Optional[Tuple[int, int]],
    ) -> _DaskCollection:
        if iteration_range is None:
            iteration_range = (0, 0)
        predts = await super()._predict_async(
            data=X,
            output_margin=output_margin,
            output_margin=self.objective == "multi:softmax",
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
        )
        return _cls_predict_proba(self.objective, predts, da.vstack)
        vstack = update_wrapper(
            partial(da.vstack, allow_unknown_chunksizes=True), da.vstack
        )
        return _cls_predict_proba(getattr(self, "n_classes_", None), predts, vstack)

    # pylint: disable=missing-function-docstring
    def predict_proba(

@@ -1797,7 +1808,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
        X: _DaskCollection,
        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        output_margin: bool = False,
        base_margin: Optional[_DaskCollection] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> Any:

@@ -1808,7 +1818,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
            self._predict_proba_async,
            X=X,
            validate_features=validate_features,
            output_margin=output_margin,
            base_margin=base_margin,
            iteration_range=iteration_range,
        )
python-package/xgboost/data.py

@@ -104,6 +104,13 @@ def _is_numpy_array(data):
    return isinstance(data, (np.ndarray, np.matrix))


def _ensure_np_dtype(data, dtype):
    if data.dtype.hasobject:
        data = data.astype(np.float32, copy=False)
        dtype = np.float32
    return data, dtype


def _maybe_np_slice(data, dtype):
    '''Handle numpy slice.  This can be removed if we use __array_interface__.
    '''

@@ -118,6 +125,7 @@ def _maybe_np_slice(data, dtype):
        data = np.array(data, copy=False, dtype=dtype)
    except AttributeError:
        data = np.array(data, copy=False, dtype=dtype)
    data, dtype = _ensure_np_dtype(data, dtype)
    return data
python-package/xgboost/sklearn.py

@@ -4,7 +4,7 @@
import copy
import warnings
import json
from typing import Union, Optional, List, Dict, Callable, Tuple, Any
from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar
import numpy as np
from .core import Booster, DMatrix, XGBoostError
from .core import _deprecate_positional_args, _convert_ntree_limit

@@ -561,6 +561,8 @@ class XGBModel(XGBModelBase):
        self._Booster.load_model(fname)
        meta = self._Booster.attr('scikit_learn')
        if meta is None:
            # FIXME(jiaming): This doesn't have to be a problem as most of the needed
            # information like num_class and objective is in Learner class.
            warnings.warn(
                'Loading a native XGBoost model with Scikit-Learn interface.')
            return

@@ -571,6 +573,8 @@ class XGBModel(XGBModelBase):
                self._le = XGBoostLabelEncoder()
                self._le.from_json(v)
                continue
            # FIXME(jiaming): This can be removed once label encoder is gone since we can
            # generate it from `np.arange(self.n_classes_)`
            if k == 'classes_':
                self.classes_ = np.array(v)
                continue

@@ -1024,17 +1028,14 @@ class XGBModel(XGBModelBase):
        return np.array(json.loads(b.get_dump(dump_format='json')[0])['bias'])


def _cls_predict_proba(
    objective: Union[str, Callable], prediction: Any, vstack: Callable
) -> Any:
    if objective == 'multi:softmax':
        raise ValueError('multi:softmax objective does not support predict_proba,'
                         ' use `multi:softprob` or `binary:logistic` instead.')
    if objective == 'multi:softprob' or callable(objective):
        # Return prediction directly if if objective is defined by user since we don't
        # know how to perform the transformation
PredtT = TypeVar("PredtT")


def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> PredtT:
    assert len(prediction.shape) <= 2
    if len(prediction.shape) == 2 and prediction.shape[1] == n_classes:
        return prediction
    # Lastly the binary logistic function
    # binary logistic function
    classone_probs = prediction
    classzero_probs = 1.0 - classone_probs
    return vstack((classzero_probs, classone_probs)).transpose()
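For the binary case the helper above simply expands the probability vector into a two-column matrix; a small numpy illustration of that branch (numbers are made up):

```python
import numpy as np

classone_probs = np.array([0.9, 0.2, 0.6])   # raw binary:logistic output, P(class 1)
proba = np.vstack((1.0 - classone_probs, classone_probs)).transpose()
# Each row holds [P(class 0), P(class 1)]; shape is (n_samples, 2).
```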
@@ -1218,8 +1219,10 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
            return class_probs

        if len(class_probs.shape) > 1:
            # turns softprob into softmax
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            # turns soft logit into class label
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1

@@ -1262,15 +1265,23 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
            a numpy array of shape array-like of shape (n_samples, n_classes) with the
            probability of each data example being of a given class.
        """
        # custom obj:  Do nothing as we don't know what to do.
        # softprob: Do nothing, output is proba.
        # softmax: Use output margin to remove the argmax in PredTransform.
        # binary:logistic: Expand the prob vector into 2-class matrix after predict.
        # binary:logitraw: Unsupported by predict_proba()
        class_probs = super().predict(
            X=X,
            output_margin=False,
            output_margin=self.objective == "multi:softmax",
            ntree_limit=ntree_limit,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range
        )
        return _cls_predict_proba(self.objective, class_probs, np.vstack)
        # If model is loaded from a raw booster there's no `n_classes_`
        return _cls_predict_proba(
            getattr(self, "n_classes_", None), class_probs, np.vstack
        )

    def evals_result(self):
        """Return the evaluation results.
src/c_api/c_api.cc

@@ -651,13 +651,17 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
  auto type = PredictionType(get<Integer const>(config["type"]));
  auto iteration_begin = get<Integer const>(config["iteration_begin"]);
  auto iteration_end = get<Integer const>(config["iteration_end"]);
  learner->Predict(
      *static_cast<std::shared_ptr<DMatrix> *>(dmat),
      type == PredictionType::kMargin, &entry.predictions, iteration_begin,
      iteration_end, get<Boolean const>(config["training"]),
      type == PredictionType::kLeaf, type == PredictionType::kContribution,
      type == PredictionType::kApproxContribution,
      type == PredictionType::kInteraction);
  bool approximate = type == PredictionType::kApproxContribution ||
                     type == PredictionType::kApproxInteraction;
  bool contribs = type == PredictionType::kContribution ||
                  type == PredictionType::kApproxContribution;
  bool interactions = type == PredictionType::kInteraction ||
                      type == PredictionType::kApproxInteraction;
  bool training = get<Boolean const>(config["training"]);
  learner->Predict(p_m, type == PredictionType::kMargin, &entry.predictions,
                   iteration_begin, iteration_end, training,
                   type == PredictionType::kLeaf, contribs, approximate,
                   interactions);
  *out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
  auto &shape = learner->GetThreadLocal().prediction_shape;
  auto chunksize = p_m->Info().num_row_ == 0 ? 0 : entry.predictions.Size() / p_m->Info().num_row_;

@@ -730,8 +734,9 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr,
  API_BEGIN();
  CHECK_HANDLE();
  std::shared_ptr<xgboost::data::CSRArrayAdapter> x{
      new xgboost::data::CSRArrayAdapter{
          StringView{indptr}, StringView{indices}, StringView{data}, cols}};
      new xgboost::data::CSRArrayAdapter{StringView{indptr},
                                         StringView{indices}, StringView{data},
                                         static_cast<size_t>(cols)}};
  std::shared_ptr<DMatrix> p_m {nullptr};
  if (m) {
    p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);

@@ -56,7 +56,6 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
    }
    case PredictionType::kApproxContribution:
    case PredictionType::kContribution: {
      auto groups = chunksize / (cols + 1);
      if (groups == 1 && !strict_shape) {
        *out_dim = 2;
        shape.resize(*out_dim);

@@ -71,6 +70,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
      }
      break;
    }
    case PredictionType::kApproxInteraction:
    case PredictionType::kInteraction: {
      if (groups == 1 && !strict_shape) {
        *out_dim = 3;

@@ -98,6 +98,10 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
        forest = std::max(static_cast<decltype(forest)>(1), forest);
        shape[3] = forest;
        *out_dim = shape.size();
      } else if (chunksize == 1) {
        *out_dim = 1;
        shape.resize(*out_dim);
        shape.front() = rows;
      } else {
        *out_dim = 2;
        shape.resize(*out_dim);
src/common/device_helpers.cuh

@@ -1290,6 +1290,21 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
      num_items, nullptr, false)));
}

template <typename InIt, typename OutIt, typename Predicate>
void CopyIf(InIt in_first, InIt in_second, OutIt out_first, Predicate pred) {
  // We loop over batches because thrust::copy_if cant deal with sizes > 2^31
  // See thrust issue #1302, #6822
  size_t max_copy_size = std::numeric_limits<int>::max() / 2;
  size_t length = std::distance(in_first, in_second);
  XGBCachingDeviceAllocator<char> alloc;
  for (size_t offset = 0; offset < length; offset += max_copy_size) {
    auto begin_input = in_first + offset;
    auto end_input = in_first + std::min(offset + max_copy_size, length);
    out_first = thrust::copy_if(thrust::cuda::par(alloc), begin_input,
                                end_input, out_first, pred);
  }
}

template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
  InclusiveScan(d_in, d_out, cub::Sum(), num_items);
@@ -1306,30 +1321,34 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx, ...)
ArgSort now sorts into a separate output buffer instead of aliasing the input: a TemporaryArray<IdxT> sorted_idx_out backs the cub::DoubleBuffer for the values (previously both halves pointed at sorted_idx.data()), the cub::DispatchRadixSort<...>::Dispatch calls in both the ascending and descending branches are wrapped in safe_cuda(...), the dh:: prefix on TemporaryArray is dropped, and the sorted result is copied back into sorted_idx with cudaMemcpyAsync(..., cudaMemcpyDeviceToDevice).

@@ -1364,14 +1383,19 @@ void SegmentedArgSort(xgboost::common::Span<U> values, ...)
SegmentedArgSort gains TemporaryArray buffers values_out and sorted_idx_out; detail::DeviceSegmentedRadixSortPair<!accending> now writes the sorted indices into sorted_idx_out rather than in place into sorted_idx, the temporary storage is a plain TemporaryArray<xgboost::common::byte>, and the result is copied back into sorted_idx with cudaMemcpyAsync at the end.
}  // namespace dh
@@ -93,6 +93,11 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
                              bst_row_t num_rows, bst_feature_t columns,
                              size_t nnz, int device,
                              size_t num_cuts, bool has_weight) {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
  // device available memory is not accurate when rmm is used.
  return nnz;
#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1

  if (sketch_batch_num_elements == 0) {
    auto required_memory = RequiredMemory(num_rows, columns, nnz, num_cuts, has_weight);
    // use up to 80% of available space

@@ -118,9 +118,8 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
  size_t num_valid = column_sizes_scan->back();
  // Copy current subset of valid elements into temporary storage and sort
  sorted_entries->resize(num_valid);
  dh::XGBCachingDeviceAllocator<char> alloc;
  thrust::copy_if(thrust::cuda::par(alloc), entry_iter + range.begin(),
                  entry_iter + range.end(), sorted_entries->begin(), is_valid);
  dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(),
             sorted_entries->begin(), is_valid);
}

void SortByWeight(dh::device_vector<float>* weights,

@@ -229,8 +229,11 @@ class ArrayInterfaceHandler {
      }
      strides[1] = n;
    }
    auto valid = (rows - 1) * strides[0] + (cols - 1) * strides[1] == (rows * cols) - 1;
    CHECK(valid) << "Invalid strides in array.";

    auto valid = rows * strides[0] + cols * strides[1] >= (rows * cols);
    CHECK(valid) << "Invalid strides in array."
                 << " strides: (" << strides[0] << "," << strides[1]
                 << "), shape: (" << rows << ", " << cols << ")";
  }
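A worked numpy illustration of the relaxed check, assuming strides are measured in elements as the formula above implies; a non-contiguous view is one case the old equality rejected:

```python
import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 4)[:, :2]   # non-contiguous view
rows, cols = x.shape                                        # (3, 2)
s0, s1 = (s // x.itemsize for s in x.strides)               # element strides: (4, 1)

old_ok = (rows - 1) * s0 + (cols - 1) * s1 == rows * cols - 1   # 9 == 5  -> False
new_ok = rows * s0 + cols * s1 >= rows * cols                   # 14 >= 6 -> True
```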
  static void* ExtractData(std::map<std::string, Json> const &column,

@@ -55,18 +55,9 @@ void CopyDataToDMatrix(AdapterT* adapter, common::Span<Entry> data,
  COOToEntryOp<decltype(batch)> transform_op{batch};
  thrust::transform_iterator<decltype(transform_op), decltype(counting)>
      transform_iter(counting, transform_op);
  // We loop over batches because thrust::copy_if cant deal with sizes > 2^31
  // See thrust issue #1302
  size_t max_copy_size = std::numeric_limits<int>::max() / 2;
  auto begin_output = thrust::device_pointer_cast(data.data());
  for (size_t offset = 0; offset < batch.Size(); offset += max_copy_size) {
    auto begin_input = transform_iter + offset;
    auto end_input =
        transform_iter + std::min(offset + max_copy_size, batch.Size());
    begin_output =
        thrust::copy_if(thrust::cuda::par(alloc), begin_input, end_input,
                        begin_output, IsValidFunctor(missing));
  }
  dh::CopyIf(transform_iter, transform_iter + batch.Size(), begin_output,
             IsValidFunctor(missing));
}

// Does not currently support metainfo as no on-device data source contains this
src/gbm/gbtree.cc

@@ -575,6 +575,20 @@ void GPUDartPredictInc(common::Span<float> out_predts,
}
#endif

void GPUDartInplacePredictInc(common::Span<float> out_predts,
                              common::Span<float> predts, float tree_w,
                              size_t n_rows, float base_score,
                              bst_group_t n_groups,
                              bst_group_t group)
#if defined(XGBOOST_USE_CUDA)
;  // NOLINT
#else
{
  common::AssertGPUSupport();
}
#endif


class Dart : public GBTree {
 public:
  explicit Dart(LearnerModelParam const* booster_config) :

@@ -728,13 +742,14 @@ class Dart : public GBTree {
      gpu_predictor_.get()
#endif  // defined(XGBOOST_USE_CUDA)
    };
    Predictor const * predictor {nullptr};

    MetaInfo info;
    StringView msg{"Unsupported data type for inplace predict."};
    int32_t device = GenericParameter::kCpuId;
    PredictionCacheEntry predts;
    // Inplace predict is not used for training, so no need to drop tree.
    for (size_t i = tree_begin; i < tree_end; ++i) {
      PredictionCacheEntry predts;
      if (tparam_.predictor == PredictorType::kAuto) {
        // Try both predictor implementations
        bool success = false;

@@ -742,6 +757,7 @@ class Dart : public GBTree {
          if (p && p->InplacePredict(x, nullptr, model_, missing, &predts, i,
                                     i + 1)) {
            success = true;
            predictor = p;
#if defined(XGBOOST_USE_CUDA)
            device = predts.predictions.DeviceIdx();
#endif  // defined(XGBOOST_USE_CUDA)

@@ -750,45 +766,52 @@ class Dart : public GBTree {
        }
        CHECK(success) << msg;
      } else {
        // No base margin for each tree
        bool success = this->GetPredictor()->InplacePredict(
            x, nullptr, model_, missing, &predts, i, i + 1);
        // No base margin from meta info for each tree
        predictor = this->GetPredictor().get();
        bool success = predictor->InplacePredict(x, nullptr, model_, missing,
                                                 &predts, i, i + 1);
        device = predts.predictions.DeviceIdx();
        CHECK(success) << msg;
      }

      auto w = this->weight_drop_.at(i);
      auto &h_predts = predts.predictions.HostVector();
      auto &h_out_predts = out_preds->predictions.HostVector();
      size_t n_groups = model_.learner_model_param->num_output_group;
      auto n_rows = predts.predictions.Size() / n_groups;

      if (i == tree_begin) {
        auto n_rows =
            h_predts.size() / model_.learner_model_param->num_output_group;
        // base margin is added here.
        if (p_m) {
          p_m->Info().num_row_ = n_rows;
          cpu_predictor_->InitOutPredictions(p_m->Info(),
                                             &out_preds->predictions, model_);
          predictor->InitOutPredictions(p_m->Info(), &out_preds->predictions,
                                        model_);
        } else {
          info.num_row_ = n_rows;
          cpu_predictor_->InitOutPredictions(info, &out_preds->predictions,
                                             model_);
          predictor->InitOutPredictions(info, &out_preds->predictions, model_);
        }
      }

      // Multiple the tree weight
      CHECK_EQ(h_predts.size(), h_out_predts.size());
      CHECK_EQ(predts.predictions.Size(), out_preds->predictions.Size());
      auto group = model_.tree_info.at(i);

      if (device == GenericParameter::kCpuId) {
        auto &h_predts = predts.predictions.HostVector();
        auto &h_out_predts = out_preds->predictions.HostVector();
#pragma omp parallel for
        for (omp_ulong i = 0; i < h_out_predts.size(); ++i) {
        for (omp_ulong ridx = 0; ridx < n_rows; ++ridx) {
          const size_t offset = ridx * n_groups + group;
          // Need to remove the base margin from indiviual tree.
          h_out_predts[i] +=
              (h_predts[i] - model_.learner_model_param->base_score) * w;
          h_out_predts[offset] +=
              (h_predts[offset] - model_.learner_model_param->base_score) * w;
        }
      }

      if (device != GenericParameter::kCpuId) {
      } else {
        out_preds->predictions.SetDevice(device);
        out_preds->predictions.DeviceSpan();
        predts.predictions.SetDevice(device);
        GPUDartInplacePredictInc(out_preds->predictions.DeviceSpan(),
                                 predts.predictions.DeviceSpan(), w, n_rows,
                                 model_.learner_model_param->base_score,
                                 n_groups, group);
      }
    }
  }
src/gbm/gbtree.cu

@@ -14,5 +14,15 @@ void GPUDartPredictInc(common::Span<float> out_predts,
    out_predts[offset] += (predts[offset] * tree_w);
  });
}

void GPUDartInplacePredictInc(common::Span<float> out_predts,
                              common::Span<float> predts, float tree_w,
                              size_t n_rows, float base_score,
                              bst_group_t n_groups, bst_group_t group) {
  dh::LaunchN(dh::CurrentDevice(), n_rows, [=] XGBOOST_DEVICE(size_t ridx) {
    const size_t offset = ridx * n_groups + group;
    out_predts[offset] += (predts[offset] - base_score) * tree_w;
  });
}
}  // namespace gbm
}  // namespace xgboost
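A small numpy sketch of the accumulation the GPU kernel and the CPU loop above perform: each single-tree inplace prediction still carries the global base_score, so it is subtracted back out before the weighted add (the numbers are illustrative only):

```python
import numpy as np

base_score = 0.5
tree_w = 0.3

# InitOutPredictions seeds the running output with base_score once.
out_predts = np.full(4, base_score, dtype=np.float32)

# A single tree's inplace prediction: that tree's output plus base_score.
predts = base_score + np.array([0.1, -0.2, 0.4, 0.0], dtype=np.float32)

out_predts += (predts - base_score) * tree_w
```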
src/metric/auc.cc

@@ -87,8 +87,7 @@ std::tuple<float, float, float> BinaryAUC(std::vector<float> const &predts,
 * - Kleiman, Ross and Page, David.  $AUC_{\mu}$: A Performance Metric for Multi-Class
 *   Machine Learning Models
 */
float MultiClassOVR(std::vector<float> const& predts, MetaInfo const& info) {
  auto n_classes = predts.size() / info.labels_.Size();
float MultiClassOVR(std::vector<float> const& predts, MetaInfo const& info, size_t n_classes) {
  CHECK_NE(n_classes, 0);
  auto const& labels = info.labels_.ConstHostVector();

@@ -230,6 +229,10 @@ class EvalAUC : public Metric {
      info.labels_.SetDevice(tparam_->gpu_id);
      info.weights_.SetDevice(tparam_->gpu_id);
    }
    // We use the global size to handle empty dataset.
    std::array<size_t, 2> meta{info.labels_.Size(), preds.Size()};
    rabit::Allreduce<rabit::op::Max>(meta.data(), meta.size());

    if (!info.group_ptr_.empty()) {
      /**
       * learning to rank

@@ -261,16 +264,17 @@ class EvalAUC : public Metric {
        CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
                         << ", valid groups: " << valid_groups;
      }
    } else if (info.labels_.Size() != preds.Size() &&
               preds.Size() % info.labels_.Size() == 0) {
    } else if (meta[0] != meta[1] && meta[1] % meta[0] == 0) {
      /**
       * multi class
       */
      size_t n_classes = meta[1] / meta[0];
      CHECK_NE(n_classes, 0);
      if (tparam_->gpu_id == GenericParameter::kCpuId) {
        auc = MultiClassOVR(preds.ConstHostVector(), info);
        auc = MultiClassOVR(preds.ConstHostVector(), info, n_classes);
      } else {
        auc = GPUMultiClassAUCOVR(preds.ConstDeviceSpan(), info, tparam_->gpu_id,
                                  &this->d_cache_);
                                  &this->d_cache_, n_classes);
      }
    } else {
      /**

@@ -323,7 +327,8 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
}

float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info,
                          int32_t device, std::shared_ptr<DeviceAUCCache>* cache) {
                          int32_t device, std::shared_ptr<DeviceAUCCache>* cache,
                          size_t n_classes) {
  common::AssertGPUSupport();
  return 0;
}
@@ -61,10 +61,12 @@ struct DeviceAUCCache {
The AllReducer is no longer created inside the is_multi branch (where it was initialised with rabit::GetRank()); instead a trailing guard "if (is_multi && !reducer) { reducer.reset(new dh::AllReducer); reducer->Init(device); }" creates it once and initialises it with the device ordinal.

@@ -197,12 +199,48 @@
A new helper ScaleClasses(results, local_area, fp, tp, auc, cache, n_classes) is added: when running distributed it all-reduces the per-class results via cache->reducer->AllReduceSum, then uses a thrust::reduce over a transform iterator to compute auc_sum = sum(auc[i] / local_area[i] * tp[i]) and tp_sum = sum(tp[i]) (classes with zero local area contribute NaN / 0), returning auc_sum / tp_sum, or quiet NaN when tp_sum == 0 or auc_sum is NaN.

@@ -213,14 +251,23 @@ float GPUMultiClassAUCOVR(...)
GPUMultiClassAUCOVR now takes n_classes as a parameter instead of deriving it from predts.size() / labels.size(), and when n_samples == 0 it fills a TemporaryArray of n_classes * 4 results with zeros and returns ScaleClasses over the local_area / fp / tp / auc subspans, so empty datasets on a worker are handled.

@@ -231,6 +278,7 @@ and @@ -270,7 +318,7 @@
d_sorted_idx is obtained from the cache just before the segmented argsort, and unique_class_ptr is sized class_ptr.size() instead of class_ptr.size() + 1.

@@ -378,32 +426,7 @@
The inline allreduce-and-reduce block at the end of GPUMultiClassAUCOVR is removed and replaced by "return ScaleClasses(d_results, local_area, fp, tp, auc, cache, n_classes);".

@@ -447,10 +470,9 @@ GPURankingAUC(...)
The sorted-index span is created after the labels span is fetched, immediately before dh::SegmentedArgSort<false>(d_labels, d_group_ptr, d_sorted_idx).

@@ -26,7 +26,8 @@ GPUBinaryAUC(...)
The GPUMultiClassAUCOVR declaration gains the trailing "size_t n_classes" parameter.
tests/ci_build/Dockerfile.gpu_build_r_centos6 (new file, 112 lines)

@@ -0,0 +1,112 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos6
ARG CUDA_VERSION_ARG

# Environment
ENV DEBIAN_FRONTEND noninteractive
ENV DEVTOOLSET_URL_ROOT http://vault.centos.org/6.9/sclo/x86_64/rh/devtoolset-4/

COPY CentOS-Base.repo /etc/yum.repos.d/

# Install all basic requirements
RUN \
    yum install -y epel-release && \
    yum -y update && \
    yum install -y tar unzip wget xz git patchelf readline-devel libX11-devel libXt-devel \
                   xorg-x11-server-devel openssl-devel texlive-* && \
    yum install -y $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-5.3.1-6.1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-gfortran-5.3.1-6.1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-libquadmath-devel-5.3.1-6.1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-c++-5.3.1-6.1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-binutils-2.25.1-8.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-runtime-4.1-3.sc1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-libstdc++-devel-5.3.1-6.1.el6.x86_64.rpm

ENV PATH=/opt/python/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/3.3.0/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/3.3.0/lib64:$LD_LIBRARY_PATH
ENV CC=/opt/rh/devtoolset-4/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-4/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp
ENV F77=/opt/rh/devtoolset-4/root/usr/bin/gfortran

# A few packages have to be built from the source because CentOS 6 is a very old distribution and
# the system packages are not sufficiently up-to-date to build R 3.3.0. We'll want to update to
# CentOS 7 after the 1.4.0 release. Tracking issue: dmlc/xgboost#6791.
#
# Why choose an old Linux distro? This is so that the resulting xgboost.so is compatible with a
# wide range of Linux OSes currently in operation. See https://www.python.org/dev/peps/pep-0571/
RUN \
    wget https://zlib.net/fossils/zlib-1.2.5.tar.gz && \
    wget https://sourceware.org/pub/bzip2/bzip2-1.0.6.tar.gz && \
    wget http://tukaani.org/xz/xz-5.2.2.tar.gz && \
    wget https://ftp.pcre.org/pub/pcre/pcre-8.40.tar.gz && \
    wget https://www.openssl.org/source/old/1.0.0/openssl-1.0.0k.tar.gz && \
    wget --no-check-certificate https://curl.se/download/curl-7.47.1.tar.gz && \
    tar xf zlib-1.2.5.tar.gz && \
    tar xf bzip2-1.0.6.tar.gz && \
    tar xf xz-5.2.2.tar.gz && \
    tar xf pcre-8.40.tar.gz && \
    tar xf openssl-1.0.0k.tar.gz && \
    tar xf curl-7.47.1.tar.gz && \
    cd zlib-1.2.5 && \
    ./configure --prefix=/opt/software/packages && \
    make -j$(nproc) && \
    make install && \
    cd ../bzip2-1.0.6 && \
    sed -i 's/CFLAGS=-Wall/CFLAGS=-fPIC -Wall/g' Makefile && \
    make -f Makefile-libbz2_so && \
    make clean && \
    make -j$(nproc) && \
    make -n install PREFIX=/opt/software/packages && \
    make install PREFIX=/opt/software/packages && \
    cd ../xz-5.2.2 && \
    ./configure --prefix=/opt/software/packages && \
    make -j$(nproc) && \
    make install && \
    cd ../pcre-8.40 && \
    ./configure --enable-utf8 --prefix=/opt/software/packages && \
    make -j$(nproc) && \
    make install && \
    cd ../curl-7.47.1 && \
    ./configure --prefix=/opt/software/packages --with-ssl && \
    make -j$(nproc) && \
    make install && \
    export CFLAGS="-I/opt/software/packages/include" && \
    export LDFLAGS="-L/opt/software/packages/lib" && \
    cd .. && \
    # R 3.3.0
    wget -nv -nc https://cran.r-project.org/src/base/R-3/R-3.3.0.tar.gz && \
    tar xf R-3.3.0.tar.gz && \
    cd R-3.3.0 && \
    ./configure --prefix=/opt/R/3.3.0 --enable-R-shlib && \
    make -j$(nproc) && \
    make install && \
    # Python
    wget -nv -nc -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3.sh -b -p /opt/python && \
    /opt/python/bin/python -m pip install auditwheel && \
    # CMake
    wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \
    bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
    # Ninja
    mkdir -p /usr/local && \
    cd /usr/local/ && \
    wget -nv -nc https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz --no-check-certificate && \
    tar xf v1.10.0.tar.gz && mv ninja-1.10.0 ninja && rm -v v1.10.0.tar.gz && \
    cd ninja && \
    /opt/python/bin/python ./configure.py --bootstrap

ENV GOSU_VERSION 1.10

# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
    wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
    chmod +x /usr/local/bin/gosu && \
    gosu nobody true

# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/

WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]
33
tests/ci_build/build_r_pkg_with_cuda.sh
Executable file
@ -0,0 +1,33 @@
#!/bin/bash
set -e
set -x

if [ "$#" -ne 1 ]
then
  echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]"
  exit 1
fi

commit_hash="$1"

make Rpack
mv xgboost/ xgboost_rpack/

mkdir build
cd build
cmake .. -GNinja -DUSE_CUDA=ON -DR_LIB=ON
ninja
cd ..

rm xgboost
# This super wacky hack is found in cmake/RPackageInstall.cmake.in and
# cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
# and have R use xgboost.so that we've already built.
rm -v xgboost_rpack/configure
rm -rfv xgboost_rpack/src
mkdir -p xgboost_rpack/src
cp -v lib/xgboost.so xgboost_rpack/src/
echo 'all:' > xgboost_rpack/src/Makefile
echo 'all:' > xgboost_rpack/src/Makefile.win
mv xgboost_rpack/ xgboost/
tar cvzf xgboost_r_gpu_linux_${commit_hash}.tar.gz xgboost/
@ -15,6 +15,7 @@ dependencies:
- matplotlib
- dask
- distributed
- graphviz
- python-graphviz
- hypothesis
- astroid
@ -11,7 +11,7 @@ vcomp140_path = 'C:\\Windows\\System32\\vcomp140.dll'

for wheel_path in sorted(glob.glob(sys.argv[1])):
    m = re.search(r'xgboost-(.*)-py3', wheel_path)
    assert m
    assert m, f'wheel_path = {wheel_path}'
    version = m.group(1)

    with zipfile.ZipFile(wheel_path, 'a') as f:
@ -26,8 +26,9 @@ dirname, basename = os.path.dirname(whl_path), os.path.basename(whl_path)
with cd(dirname):
    tokens = basename.split('-')
    assert len(tokens) == 5
    version = tokens[1].split('+')[0]
    keywords = {'pkg_name': tokens[0],
                'version': tokens[1],
                'version': version,
                'commit_id': commit_id,
                'platform_tag': platform_tag}
    new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords)
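The one-line change above strips any pre-existing local-version suffix before the commit id is appended, so a wheel is never renamed to something like 1.4.2+a+b. A standalone sketch of the renaming with made-up values (in the real script basename, commit_id and platform_tag are derived from the wheel path and CI environment):

    # Hypothetical inputs for illustration only.
    basename = 'xgboost-1.4.2+oldtag-py3-none-manylinux2010_x86_64.whl'
    commit_id = '0123abcd'
    platform_tag = 'manylinux2010_x86_64'

    tokens = basename.split('-')        # ['xgboost', '1.4.2+oldtag', 'py3', 'none', '...whl']
    version = tokens[1].split('+')[0]   # '1.4.2' -- drop any existing '+<suffix>'
    keywords = {'pkg_name': tokens[0],
                'version': version,
                'commit_id': commit_id,
                'platform_tag': platform_tag}
    new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords)
    print(new_name)  # xgboost-1.4.2+0123abcd-py3-none-manylinux2010_x86_64.whl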
@ -45,6 +45,10 @@ TEST(HistUtil, DeviceSketch) {
}

TEST(HistUtil, SketchBatchNumElements) {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
  LOG(WARNING) << "Test not runnable with RMM enabled.";
  return;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
  size_t constexpr kCols = 10000;
  int device;
  dh::safe_cuda(cudaGetDevice(&device));
@ -332,27 +332,44 @@ class TestGPUPredict:
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)

    def test_predict_dart(self):
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        from sklearn.datasets import make_classification
        import cupy as cp
        rng = cp.random.RandomState(1994)
        n_samples = 1000
        X = rng.randn(n_samples, 10)
        y = rng.randn(n_samples)
        X_, y_ = make_classification(
            n_samples=n_samples, n_informative=5, n_classes=n_classes
        )
        X, y = cp.array(X_), cp.array(y_)

        Xy = xgb.DMatrix(X, y)
        booster = xgb.train(
            {
        if n_classes == 2:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
            },
            Xy,
            num_boost_round=32
        )
                "objective": "binary:logistic"
            }
        else:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "multi:softprob",
                "num_class": n_classes
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)
        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
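The rewritten test trains DART boosters for both binary and multiclass objectives and checks that in-place prediction, DMatrix prediction, and the CPU predictor all agree. A CPU-only sketch of the same consistency check (synthetic data, "hist" substituted for "gpu_hist" so no GPU is needed; not part of this diff):

    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=1000, n_informative=5, n_classes=3)
    params = {
        "tree_method": "hist",
        "booster": "dart",
        "rate_drop": 0.5,
        "objective": "multi:softprob",
        "num_class": 3,
    }
    Xy = xgb.DMatrix(X, y)
    booster = xgb.train(params, Xy, num_boost_round=32)

    # Both prediction paths should agree for DART once all trees are used.
    inplace = booster.inplace_predict(X)
    copied = booster.predict(Xy)
    np.testing.assert_allclose(inplace, copied, atol=1e-6)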
@ -173,13 +173,13 @@ def run_gpu_hist(
    assert tm.non_increasing(history["train"][dataset.metric])


@pytest.mark.skipif(**tm.no_cudf())
def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
    import cudf
    from sklearn.datasets import load_breast_cancer
    with Client(local_cuda_cluster) as client:
        X_, y_ = load_breast_cancer(return_X_y=True)
        X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
        y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
        X = dd.from_array(X_, chunksize=100)
        y = dd.from_array(y_, chunksize=100)
        run_boost_from_prediction(X, y, "gpu_hist", client)

|
    @settings(deadline=duration(seconds=120), suppress_health_check=suppress)
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize(
        "local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
    )
@ -276,7 +277,7 @@ class TestDistributedGPU:
        X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
        y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
        w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
        run_dask_classifier(X, y, w, model, client)
        run_dask_classifier(X, y, w, model, "gpu_hist", client, 10)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
@ -454,6 +455,7 @@ async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainRetur

@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_with_asyncio(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
@ -57,15 +57,13 @@ class TestLinear:
        param['updater'] = 'shotgun'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
        # shotgun is non-deterministic, so we relax the test by sampling
        # result.
        # shotgun is non-deterministic, so we relax the test by only using first and last
        # iteration.
        if len(result) > 2:
            sampled_result = [score for i, score in enumerate(result)
                              if i % 2 == 0]
            sampled_result[-1] = result[-1]  # make sure the last one is used
            sampled_result = (result[0], result[-1])
        else:
            sampled_result = result
        assert tm.non_increasing(sampled_result, 1e-3)
        assert tm.non_increasing(sampled_result)

    @given(parameter_strategy, strategies.integers(10, 50),
           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
@ -75,6 +75,11 @@ def run_predict_leaf(predictor):
    first = sliced[0, ...]

    assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit

    # When there's only 1 tree, the output is a 1 dim vector
    booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
    assert booster.predict(m, pred_leaf=True).shape == (rows, )

    return leaf

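The new assertion documents a shape convention of pred_leaf output: with a single boosted tree the leaf indices come back as a flat vector of length n_rows, while multiple trees (or classes / parallel trees) yield one column per tree. A small self-contained sketch of that behaviour on synthetic data (names here are illustrative, not from the test suite):

    import numpy as np
    import xgboost as xgb

    rows, cols = 100, 4
    rng = np.random.RandomState(0)
    m = xgb.DMatrix(rng.randn(rows, cols), label=rng.randint(0, 2, size=rows))

    # One tree: leaf predictions collapse to a 1-dim vector of length `rows`.
    one_tree = xgb.train({"tree_method": "hist"}, dtrain=m, num_boost_round=1)
    assert one_tree.predict(m, pred_leaf=True).shape == (rows,)

    # Several trees: one leaf index per (row, tree).
    many_trees = xgb.train({"tree_method": "hist"}, dtrain=m, num_boost_round=4)
    assert many_trees.predict(m, pred_leaf=True).shape == (rows, 4)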
@ -98,6 +103,27 @@ def test_predict_shape():
    assert len(contrib.shape) == 3
    assert contrib.shape[1] == 1

    contrib = reg.get_booster().predict(
        xgb.DMatrix(X), pred_contribs=True, approx_contribs=True
    )
    assert len(contrib.shape) == 2
    assert contrib.shape[1] == X.shape[1] + 1

    interaction = reg.get_booster().predict(
        xgb.DMatrix(X), pred_interactions=True, approx_contribs=True
    )
    assert len(interaction.shape) == 3
    assert interaction.shape[1] == X.shape[1] + 1
    assert interaction.shape[2] == X.shape[1] + 1

    interaction = reg.get_booster().predict(
        xgb.DMatrix(X), pred_interactions=True, approx_contribs=True, strict_shape=True
    )
    assert len(interaction.shape) == 4
    assert interaction.shape[1] == 1
    assert interaction.shape[2] == X.shape[1] + 1
    assert interaction.shape[3] == X.shape[1] + 1


class TestInplacePredict:
    '''Tests for running inplace prediction'''
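The added assertions spell out the output shapes of approximate SHAP values: contributions carry one extra column for the bias term, interactions add one extra row and column, and strict_shape=True prepends an output-group axis. A hedged, self-contained illustration on synthetic regression data (not taken from the test suite):

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(50, 8)
    y = np.random.randn(50)
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, y), num_boost_round=4)

    contrib = booster.predict(xgb.DMatrix(X), pred_contribs=True, approx_contribs=True)
    print(contrib.shape)      # (50, 9): one column per feature plus the bias term

    interaction = booster.predict(xgb.DMatrix(X), pred_interactions=True, approx_contribs=True)
    print(interaction.shape)  # (50, 9, 9)

    interaction = booster.predict(
        xgb.DMatrix(X), pred_interactions=True, approx_contribs=True, strict_shape=True
    )
    print(interaction.shape)  # (50, 1, 9, 9): strict_shape adds the output-group axis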
@ -129,6 +155,14 @@ class TestInplacePredict:
        predt_from_array = booster.inplace_predict(X[:10, ...], missing=self.missing)
        predt_from_dmatrix = booster.predict(test)

        X_obj = X.copy().astype(object)

        assert X_obj.dtype.hasobject is True
        assert X.dtype.hasobject is False
        np.testing.assert_allclose(
            booster.inplace_predict(X_obj), booster.inplace_predict(X)
        )

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        predt_from_array = booster.inplace_predict(
@ -166,8 +200,13 @@ class TestInplacePredict:
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        X = df.values
        X = np.asfortranarray(X)
        fort_predt = booster.inplace_predict(X)

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)
        np.testing.assert_allclose(fort_predt, arr_predt)

    def test_base_margin(self):
        booster = self.booster
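The added lines confirm that inplace_predict produces identical results for C-ordered and Fortran-ordered inputs. A compact sketch of the same check outside the test harness, using synthetic data:

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(200, 6)
    y = np.random.randn(200)
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, y), num_boost_round=4)

    c_order = booster.inplace_predict(X)                     # C-contiguous input
    f_order = booster.inplace_predict(np.asfortranarray(X))  # Fortran-contiguous input
    np.testing.assert_allclose(c_order, f_order)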
@ -317,15 +317,19 @@ def run_dask_classifier(
    y: xgb.dask._DaskCollection,
    w: xgb.dask._DaskCollection,
    model: str,
    tree_method: Optional[str],
    client: "Client",
    n_classes,
) -> None:
    metric = "merror" if n_classes > 2 else "logloss"

    if model == "boosting":
        classifier = xgb.dask.DaskXGBClassifier(
            verbosity=1, n_estimators=2, eval_metric="merror"
            verbosity=1, n_estimators=2, eval_metric=metric, tree_method=tree_method
        )
    else:
        classifier = xgb.dask.DaskXGBRFClassifier(
            verbosity=1, n_estimators=2, eval_metric="merror"
            verbosity=1, n_estimators=2, eval_metric=metric, tree_method=tree_method
        )

    assert classifier._estimator_type == "classifier"
@ -343,7 +347,7 @@ def run_dask_classifier(
    assert isinstance(history, dict)

    assert list(history.keys())[0] == "validation_0"
    assert list(history["validation_0"].keys())[0] == "merror"
    assert list(history["validation_0"].keys())[0] == metric
    assert len(list(history["validation_0"])) == 1
    forest = int(
        json.loads(classifier.get_booster().save_config())["learner"][
@ -351,19 +355,20 @@ def run_dask_classifier(
            ]["gbtree_train_param"]["num_parallel_tree"]
    )
    if model == "boosting":
        assert len(history["validation_0"]["merror"]) == 2
        assert len(history["validation_0"][metric]) == 2
        assert forest == 1
    else:
        assert len(history["validation_0"]["merror"]) == 1
        assert len(history["validation_0"][metric]) == 1
        assert forest == 2

    # Test .predict_proba()
    probas = classifier.predict_proba(X).compute()
    assert classifier.n_classes_ == 10
    assert classifier.n_classes_ == n_classes
    assert probas.ndim == 2
    assert probas.shape[0] == kRows
    assert probas.shape[1] == 10
    assert probas.shape[1] == n_classes

    if n_classes > 2:
        cls_booster = classifier.get_booster()
        single_node_proba = cls_booster.inplace_predict(X.compute())

@ -375,10 +380,10 @@ def run_dask_classifier(
        cupy.testing.assert_allclose(single_node_proba, probas)

    # Test with dataframe, not shared with GPU as cupy doesn't work well with da.unique.
    if isinstance(X, da.Array):
    if isinstance(X, da.Array) and n_classes > 2:
        X_d: dd.DataFrame = X.to_dask_dataframe()

        assert classifier.n_classes_ == 10
        assert classifier.n_classes_ == n_classes
        prediction_df = classifier.predict(X_d).compute()

        assert prediction_df.ndim == 1
@ -393,7 +398,12 @@ def run_dask_classifier(
def test_dask_classifier(model: str, client: "Client") -> None:
    X, y, w = generate_array(with_weights=True)
    y = (y * 10).astype(np.int32)
    run_dask_classifier(X, y, w, model, client)
    run_dask_classifier(X, y, w, model, None, client, 10)

    y_bin = y.copy()
    y_bin[y > 5] = 1.0
    y_bin[y <= 5] = 0.0
    run_dask_classifier(X, y_bin, w, model, None, client, 2)


@pytest.mark.skipif(**tm.no_sklearn())
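The helper now picks the evaluation metric from the number of classes (logloss for binary, merror for multiclass) and threads tree_method through to the estimator. A hedged sketch of the same pattern against the public dask API, assuming a local cluster and synthetic arrays (none of these names come from the test suite):

    from dask import array as da
    from dask.distributed import Client, LocalCluster
    import xgboost as xgb

    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        X = da.random.random((1000, 10), chunks=(100, 10))
        y = (da.random.random(1000, chunks=100) * 3).astype("int32")  # 3 classes

        n_classes = 3
        metric = "merror" if n_classes > 2 else "logloss"
        clf = xgb.dask.DaskXGBClassifier(
            n_estimators=2, eval_metric=metric, tree_method="hist"
        )
        clf.fit(X, y, eval_set=[(X, y)])

        # One probability column per class, mirroring the assertions above.
        proba = clf.predict_proba(X).compute()
        assert proba.shape[1] == n_classes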
@ -559,22 +569,26 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) ->
    # multiclass
    X_, y_ = make_classification(
        n_samples=n_samples,
        n_classes=10,
        n_classes=n_workers,
        n_informative=n_features,
        n_redundant=0,
        n_repeated=0
    )
    for i in range(y_.shape[0]):
        y_[i] = i % n_workers
    X = dd.from_array(X_, chunksize=10)
    y = dd.from_array(y_, chunksize=10)

    n_samples = n_workers - 1
    valid_X_, valid_y_ = make_classification(
        n_samples=n_samples,
        n_classes=10,
        n_classes=n_workers,
        n_informative=n_features,
        n_redundant=0,
        n_repeated=0
    )
    for i in range(valid_y_.shape[0]):
        valid_y_[i] = i % n_workers
    valid_X = dd.from_array(valid_X_, chunksize=n_samples)
    valid_y = dd.from_array(valid_y_, chunksize=n_samples)

@ -585,9 +599,9 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) ->


def test_empty_dmatrix_auc() -> None:
    with LocalCluster(n_workers=2) as cluster:
    with LocalCluster(n_workers=8) as cluster:
        with Client(cluster) as client:
            run_empty_dmatrix_auc(client, "hist", 2)
            run_empty_dmatrix_auc(client, "hist", 8)


def run_auc(client: "Client", tree_method: str) -> None:
@ -1014,6 +1028,16 @@ class TestWithDask:
                            evals=[(m, 'train')])['history']
        note(history)
        history = history['train'][dataset.metric]

        def is_stump():
            return params["max_depth"] == 1 or params["max_leaves"] == 1

        def minimum_bin():
            return "max_bin" in params and params["max_bin"] == 2

        if minimum_bin() and is_stump():
            assert tm.non_increasing(history, tolerance=1e-3)
        else:
            assert tm.non_increasing(history)
        # Make sure that it's decreasing
        assert history[-1] < history[0]

@ -272,6 +272,8 @@ def eval_error_metric(predt, dtrain: xgb.DMatrix):
    label = dtrain.get_label()
    r = np.zeros(predt.shape)
    gt = predt > 0.5
    if predt.size == 0:
        return "CustomErr", 0
    r[gt] = 1 - label[gt]
    le = predt <= 0.5
    r[le] = label[le]

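eval_error_metric above is a custom evaluation function returning a (name, value) pair; the new guard makes it safe on workers that receive an empty DMatrix. A minimal, hedged example of wiring such a metric into training (single-node, synthetic data; the final return line is an assumed completion of the helper, and feval is the 1.x native-API parameter name):

    import numpy as np
    import xgboost as xgb

    def eval_error_metric(predt, dtrain: xgb.DMatrix):
        label = dtrain.get_label()
        r = np.zeros(predt.shape)
        gt = predt > 0.5
        if predt.size == 0:        # an empty shard yields a neutral score
            return "CustomErr", 0
        r[gt] = 1 - label[gt]
        le = predt <= 0.5
        r[le] = label[le]
        return "CustomErr", np.sum(r)  # assumed remainder: aggregate per-row errors

    X = np.random.randn(100, 5)
    y = np.random.randint(0, 2, size=100)
    dtrain = xgb.DMatrix(X, y)
    xgb.train(
        {"objective": "binary:logistic", "tree_method": "hist"},
        dtrain,
        num_boost_round=5,
        evals=[(dtrain, "train")],
        feval=eval_error_metric,
    )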
@ -1,7 +1,5 @@
#!/bin/bash

make -f dmlc-core/scripts/packages.mk lz4

source $HOME/miniconda/bin/activate

if [ ${TASK} == "python_sdist_test" ]; then
@ -1,5 +1,9 @@
#!/bin/bash

# https://travis-ci.community/t/macos-build-fails-because-of-homebrew-bundle-unknown-command/7296/27
brew install cmake libomp lz4


if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_sdist_test" ]; then
    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
        wget --no-verbose -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh