Bump version to 1.7.6 (#9305)

[backport] Optimize prediction with QuantileDMatrix. (#9096) (#9303)
[backport] Fix monotone constraints on CPU. (#9122) (#9287)
2023-06-16 03:33:16 +08:00 · 2023-06-15 23:32:03 +08:00 · 2023-06-11 17:51:25 +08:00 · 2023-06-11 13:22:23 +08:00 · 2023-06-11 13:18:23 +08:00 · 2023-06-11 11:08:45 +08:00
143 changed files with 2675 additions and 1711 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -75,19 +75,18 @@ jobs:
    - uses: actions/checkout@v2
      with:
        submodules: 'true'
-    - name: Install system packages
-      run: |
-        sudo apt-get install -y --no-install-recommends ninja-build
-    - uses: conda-incubator/setup-miniconda@v2
+    - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
      with:
-        auto-update-conda: true
-        python-version: ${{ matrix.python-version }}
-        activate-environment: test
+        cache-downloads: true
+        cache-env: true
+        environment-name: cpp_test
+        environment-file: tests/ci_build/conda_env/cpp_test.yml
    - name: Display Conda env
      shell: bash -l {0}
      run: |
        conda info
        conda list
+
    - name: Build and install XGBoost static library
      shell: bash -l {0}
      run: |
@@ -109,6 +108,7 @@ jobs:
        cd ..
        rm -rf ./build
        popd
+
    - name: Build and install XGBoost shared library
      shell: bash -l {0}
      run: |
@@ -148,66 +148,13 @@ jobs:
      run: |
        LINT_LANG=cpp make lint

-  doxygen:
-    runs-on: ubuntu-latest
-    name: Generate C/C++ API doc using Doxygen
-    steps:
-    - uses: actions/checkout@v2
-      with:
-        submodules: 'true'
-    - uses: actions/setup-python@v2
-      with:
-        python-version: "3.8"
-        architecture: 'x64'
-    - name: Install system packages
-      run: |
-        sudo apt-get install -y --no-install-recommends doxygen graphviz ninja-build
-        python -m pip install wheel setuptools
-        python -m pip install awscli
-    - name: Run Doxygen
-      run: |
-        mkdir build
-        cd build
-        cmake .. -DBUILD_C_DOC=ON -GNinja
-        ninja -v doc_doxygen
-    - name: Extract branch name
-      shell: bash
-      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
-      id: extract_branch
-      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
-    - name: Publish
-      run: |
-        cd build/
-        tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
-        python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
-      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
-      env:
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
-
-  sphinx:
-    runs-on: ubuntu-latest
-    name: Build docs using Sphinx
-    steps:
-    - uses: actions/checkout@v2
-      with:
-        submodules: 'true'
-    - uses: actions/setup-python@v2
-      with:
-        python-version: "3.8"
-        architecture: 'x64'
-    - name: Install system packages
-      run: |
-        sudo apt-get install -y --no-install-recommends graphviz
-        python -m pip install wheel setuptools
-        python -m pip install -r doc/requirements.txt
-    - name: Extract branch name
-      shell: bash
-      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
-      id: extract_branch
-      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
-    - name: Run Sphinx
-      run: |
-        make -C doc html
-      env:
-        SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
+        python3 dmlc-core/scripts/lint.py --exclude_path \
+            python-package/xgboost/dmlc-core \
+            python-package/xgboost/include \
+            python-package/xgboost/lib \
+            python-package/xgboost/rabit \
+            python-package/xgboost/src \
+            --pylint-rc python-package/.pylintrc \
+            xgboost \
+            cpp \
+            include src python-package
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -41,12 +41,46 @@ jobs:
      run: |
        python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1

-  python-sdist-test:
+  python-sdist-test-on-Linux:
+    # Mismatched glibcxx version between system and conda forge.
    runs-on: ${{ matrix.os }}
    name: Test installing XGBoost Python source package on ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-11, windows-latest]
+        os: [ubuntu-latest]
+    steps:
+    - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      with:
+        submodules: 'true'
+    - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      with:
+        cache-downloads: true
+        cache-env: false
+        environment-name: sdist_test
+        environment-file: tests/ci_build/conda_env/sdist_test.yml
+    - name: Display Conda env
+      shell: bash -l {0}
+      run: |
+        conda info
+        conda list
+    - name: Build and install XGBoost
+      shell: bash -l {0}
+      run: |
+        cd python-package
+        python --version
+        python setup.py sdist
+        pip install -v ./dist/xgboost-*.tar.gz
+        cd ..
+        python -c 'import xgboost'
+
+  python-sdist-test:
+    # Use system toolchain instead of conda toolchain for macos and windows.
+    # MacOS has linker error if clang++ from conda-forge is used
+    runs-on: ${{ matrix.os }}
+    name: Test installing XGBoost Python source package on ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [macos-11, windows-latest]
        python-version: ["3.8"]
    steps:
    - uses: actions/checkout@v2
@@ -56,11 +90,7 @@ jobs:
      if: matrix.os == 'macos-11'
      run: |
        brew install ninja libomp
-    - name: Install Ubuntu system dependencies
-      if: matrix.os == 'ubuntu-latest'
-      run: |
-        sudo apt-get install -y --no-install-recommends ninja-build
-    - uses: conda-incubator/setup-miniconda@v2
+    - uses: conda-incubator/setup-miniconda@35d1405e78aa3f784fe3ce9a2eb378d5eeb62169 # v2.1.1
      with:
        auto-update-conda: true
        python-version: ${{ matrix.python-version }}
@@ -80,6 +110,58 @@ jobs:
        cd ..
        python -c 'import xgboost'

+  python-tests-on-macos:
+    name: Test XGBoost Python package on ${{ matrix.config.os }}
+    runs-on: ${{ matrix.config.os }}
+    timeout-minutes: 60
+    strategy:
+      matrix:
+        config:
+          - {os: macos-11}
+
+    steps:
+    - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      with:
+        submodules: 'true'
+
+    - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      with:
+        cache-downloads: true
+        cache-env: false
+        environment-name: macos_test
+        environment-file: tests/ci_build/conda_env/macos_cpu_test.yml
+
+    - name: Display Conda env
+      shell: bash -l {0}
+      run: |
+        conda info
+        conda list
+
+    - name: Build XGBoost on macos
+      shell: bash -l {0}
+      run: |
+        brew install ninja
+
+        mkdir build
+        cd build
+        # Set prefix, to use OpenMP library from Conda env
+        # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228
+        # to learn why we don't use libomp from Homebrew.
+        cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+        ninja
+
+    - name: Install Python package
+      shell: bash -l {0}
+      run: |
+        cd python-package
+        python --version
+        python setup.py install
+
+    - name: Test Python package
+      shell: bash -l {0}
+      run: |
+        pytest -s -v -rxXs --durations=0 ./tests/python
+
  python-tests-on-win:
    name: Test XGBoost Python package on ${{ matrix.config.os }}
    runs-on: ${{ matrix.config.os }}
@@ -125,56 +207,4 @@ jobs:
    - name: Test Python package
      shell: bash -l {0}
      run: |
-        pytest -s -v ./tests/python
-
-  python-tests-on-macos:
-    name: Test XGBoost Python package on ${{ matrix.config.os }}
-    runs-on: ${{ matrix.config.os }}
-    timeout-minutes: 90
-    strategy:
-      matrix:
-        config:
-          - {os: macos-11, python-version "3.8" }
-
-    steps:
-    - uses: actions/checkout@v2
-      with:
-        submodules: 'true'
-
-    - uses: conda-incubator/setup-miniconda@v2
-      with:
-        auto-update-conda: true
-        python-version: ${{ matrix.config.python-version }}
-        activate-environment: macos_test
-        environment-file: tests/ci_build/conda_env/macos_cpu_test.yml
-
-    - name: Display Conda env
-      shell: bash -l {0}
-      run: |
-        conda info
-        conda list
-
-    - name: Build XGBoost on macos
-      shell: bash -l {0}
-      run: |
-        brew install ninja
-
-        mkdir build
-        cd build
-        # Set prefix, to use OpenMP library from Conda env
-        # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228
-        # to learn why we don't use libomp from Homebrew.
-        cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
-        ninja
-
-    - name: Install Python package
-      shell: bash -l {0}
-      run: |
-        cd python-package
-        python --version
-        python setup.py install
-
-    - name: Test Python package
-      shell: bash -l {0}
-      run: |
-        pytest -s -v ./tests/python
+        pytest -s -v -rxXs --durations=0 ./tests/python
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -5,6 +5,7 @@ on: [push, pull_request]
 env:
  R_PACKAGES: c('XML', 'data.table', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
  GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+  _R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_: 2.5

 permissions:
  contents: read # to fetch code (actions/checkout)
@@ -68,6 +69,7 @@ jobs:
          - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'cmake'}
    env:
      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      _R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_: 2.5
      RSPM: ${{ matrix.config.rspm }}

    steps:
@@ -121,6 +123,10 @@ jobs:
        config:
          - {r: 'release'}

+    env:
+      _R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_: 2.5
+      MAKE: "make -j$(nproc)"
+
    steps:
    - uses: actions/checkout@v2
      with:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 1.7.1)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(xgboost LANGUAGES CXX C VERSION 1.7.6)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
@@ -168,9 +168,6 @@ find_package(Threads REQUIRED)

 if (USE_OPENMP)
  if (APPLE)
-    # Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
-    # OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
-    cmake_minimum_required(VERSION 3.16)
    find_package(OpenMP)
    if (NOT OpenMP_FOUND)
      # Try again with extra path info; required for libomp 15+ from Homebrew
--- a/Makefile
+++ b/Makefile
@@ -126,7 +126,6 @@ Rpack: clean_all
 	cat R-package/src/Makevars.in|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.in
 	cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
 	rm -f xgboost/src/Makevars.win-e   # OSX sed create this extra file; remove it
-	rm -f xgboost/cleanup
 	bash R-package/remove_warning_suppression_pragma.sh
 	bash xgboost/remove_warning_suppression_pragma.sh
 	rm xgboost/remove_warning_suppression_pragma.sh
--- a/R-package/CMakeLists.txt
+++ b/R-package/CMakeLists.txt
@@ -31,7 +31,7 @@ if (USE_OPENMP)
 endif (USE_OPENMP)
 set_target_properties(
  xgboost-r PROPERTIES
-  CXX_STANDARD 14
+  CXX_STANDARD 17
  CXX_STANDARD_REQUIRED ON
  POSITION_INDEPENDENT_CODE ON)

--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.7.1.1
-Date: 2022-11-03
+Version: 1.7.6.1
+Date: 2023-06-16
 Authors@R: c(
  person("Tianqi", "Chen", role = c("aut"),
         email = "tianqi.tchen@gmail.com"),
@@ -66,5 +66,6 @@ Imports:
    methods,
    data.table (>= 1.9.6),
    jsonlite (>= 1.0),
-RoxygenNote: 7.1.1
-SystemRequirements: GNU make, C++14
+RoxygenNote: 7.2.3
+Encoding: UTF-8
+SystemRequirements: GNU make, C++17
--- a/R-package/LICENSE
+++ b/R-package/LICENSE
@@ -1,9 +1,9 @@
-Copyright (c) 2014 by Tianqi Chen and Contributors 
+Copyright (c) 2014-2023, Tianqi Chen and XGBoost Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-    
+
   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -544,9 +544,11 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' @return
 #' Results are stored in the \code{coefs} element of the closure.
-#' The \code{\link{xgb.gblinear.history}} convenience function provides an easy way to access it.
+#' The \code{\link{xgb.gblinear.history}} convenience function provides an easy
+#' way to access it.
 #' With \code{xgb.train}, it is either a dense or a sparse matrix.
-#' While with \code{xgb.cv}, it is a list (an element per each fold) of such matrices.
+#' While with \code{xgb.cv}, it is a list (an element per each fold) of such
+#' matrices.
 #'
 #' @seealso
 #' \code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
@@ -558,7 +560,7 @@ cb.cv.predict <- function(save_models = FALSE) {
 #' # without considering the 2nd order interactions:
 #' x <- model.matrix(Species ~ .^2, iris)[,-1]
 #' colnames(x)
-#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"))
+#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2)
 #' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
 #'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
 #' # For 'shotgun', which is a default linear updater, using high eta values may result in
@@ -583,14 +585,14 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' # For xgb.cv:
 #' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
-#'              callbacks = list(cb.gblinear.history()))
+#'               callbacks = list(cb.gblinear.history()))
 #' # coefficients in the CV fold #3
 #' matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
 #'
 #'
 #' #### Multiclass classification:
 #' #
-#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1)
+#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 2)
 #' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
 #'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
 #' # For the default linear updater 'shotgun' it sometimes is helpful
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -328,8 +328,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
                                reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
  object <- xgb.Booster.complete(object, saveraw = FALSE)
+
  if (!inherits(newdata, "xgb.DMatrix"))
-    newdata <- xgb.DMatrix(newdata, missing = missing)
+    newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
  if (!is.null(object[["feature_names"]]) &&
      !is.null(colnames(newdata)) &&
      !identical(object[["feature_names"]], colnames(newdata)))
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -18,7 +18,7 @@
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
@@ -110,7 +110,7 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL, nth
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
-#' dtrain <- xgb.DMatrix(train$data, label=train$label)
+#' dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
 #'
 #' stopifnot(nrow(dtrain) == nrow(train$data))
 #' stopifnot(ncol(dtrain) == ncol(train$data))
@@ -138,7 +138,7 @@ dim.xgb.DMatrix <- function(x) {
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
-#' dtrain <- xgb.DMatrix(train$data, label=train$label)
+#' dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
 #' dimnames(dtrain)
 #' colnames(dtrain)
 #' colnames(dtrain) <- make.names(1:ncol(train$data))
@@ -193,7 +193,7 @@ dimnames.xgb.DMatrix <- function(x) {
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #'
 #' labels <- getinfo(dtrain, 'label')
 #' setinfo(dtrain, 'label', 1-labels)
@@ -249,7 +249,7 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #'
 #' labels <- getinfo(dtrain, 'label')
 #' setinfo(dtrain, 'label', 1-labels)
@@ -345,7 +345,7 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #'
 #' dsub <- slice(dtrain, 1:42)
 #' labels1 <- getinfo(dsub, 'label')
@@ -401,7 +401,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #'
 #' dtrain
 #' print(dtrain, verbose=TRUE)
--- a/R-package/R/xgb.DMatrix.save.R
+++ b/R-package/R/xgb.DMatrix.save.R
@@ -7,7 +7,7 @@
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
--- a/R-package/R/xgb.create.features.R
+++ b/R-package/R/xgb.create.features.R
@@ -48,8 +48,8 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
-#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
 #'
 #' param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
 #' nrounds = 4
@@ -65,8 +65,12 @@
 #' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
 #'
 #' # learning with new features
-#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
-#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+#' new.dtrain <- xgb.DMatrix(
+#'   data = new.features.train, label = agaricus.train$label, nthread = 2
+#' )
+#' new.dtest <- xgb.DMatrix(
+#'   data = new.features.test, label = agaricus.test$label, nthread = 2
+#' )
 #' watchlist <- list(train = new.dtrain)
 #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
 #'
@@ -79,7 +83,7 @@
 #'           accuracy.after, "!\n"))
 #'
 #' @export
-xgb.create.features <- function(model, data, ...){
+xgb.create.features <- function(model, data, ...) {
  check.deprecation(...)
  pred_with_leaf <- predict(model, data, predleaf = TRUE)
  cols <- lapply(as.data.frame(pred_with_leaf), factor)
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -110,9 +110,9 @@
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #' cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
-#'                   max_depth = 3, eta = 1, objective = "binary:logistic")
+#'              max_depth = 3, eta = 1, objective = "binary:logistic")
 #' print(cv)
 #' print(cv, verbose=TRUE)
 #'
@@ -192,7 +192,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =

  # create the booster-folds
  # train_folds
-  dall <- xgb.get.DMatrix(data, label, missing)
+  dall <- xgb.get.DMatrix(data, label, missing, nthread = params$nthread)
  bst_folds <- lapply(seq_along(folds), function(k) {
    dtest  <- slice(dall, folds[[k]])
    # code originally contributed by @RolandASc on stackoverflow
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -34,7 +34,7 @@
 #' The branches that also used for missing values are marked as bold
 #' (as in "carrying extra capacity").
 #'
-#' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
+#' This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
 #'
 #' @return
 #'
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -192,8 +192,8 @@
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
-#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label))
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
 #' watchlist <- list(train = dtrain, eval = dtest)
 #'
 #' ## A simple xgb.train example:
--- a/R-package/configure
+++ b/R-package/configure
--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@@ -2,10 +2,25 @@

 AC_PREREQ(2.69)

-AC_INIT([xgboost],[1.7.1],[],[xgboost],[])
+AC_INIT([xgboost],[1.7.6],[],[xgboost],[])

-# Use this line to set CC variable to a C compiler
-AC_PROG_CC
+: ${R_HOME=`R RHOME`}
+if test -z "${R_HOME}"; then
+  echo "could not determine R_HOME"
+  exit 1
+fi
+
+CXX17=`"${R_HOME}/bin/R" CMD config CXX17`
+CXX17STD=`"${R_HOME}/bin/R" CMD config CXX17STD`
+CXX="${CXX17} ${CXX17STD}"
+CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
+
+CC=`"${R_HOME}/bin/R" CMD config CC`
+CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
+CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
+
+LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS`
+AC_LANG(C++)

 ### Check whether backtrace() is part of libc or the external lib libexecinfo
 AC_MSG_CHECKING([Backtrace lib])
@@ -40,7 +55,7 @@ then
  ac_pkg_openmp=no
  AC_MSG_CHECKING([whether OpenMP will work in a package])
  AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
-  ${CC} -o conftest conftest.c ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
+  ${CXX} -o conftest conftest.cpp ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
  AC_MSG_RESULT([${ac_pkg_openmp}])
  if test "${ac_pkg_openmp}" = no; then
    OPENMP_CXXFLAGS=''
--- a/R-package/man/cb.gblinear.history.Rd
+++ b/R-package/man/cb.gblinear.history.Rd
@@ -15,9 +15,11 @@ selected per iteration.}
 }
 \value{
 Results are stored in the \code{coefs} element of the closure.
-The \code{\link{xgb.gblinear.history}} convenience function provides an easy way to access it.
+The \code{\link{xgb.gblinear.history}} convenience function provides an easy
+way to access it.
 With \code{xgb.train}, it is either a dense or a sparse matrix.
-While with \code{xgb.cv}, it is a list (an element per each fold) of such matrices.
+While with \code{xgb.cv}, it is a list (an element per each fold) of such
+matrices.
 }
 \description{
 Callback closure for collecting the model coefficients history of a gblinear booster
@@ -38,7 +40,7 @@ Callback function expects the following values to be set in its calling frame:
 # without considering the 2nd order interactions:
 x <- model.matrix(Species ~ .^2, iris)[,-1]
 colnames(x)
-dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"))
+dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2)
 param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
              lambda = 0.0003, alpha = 0.0003, nthread = 2)
 # For 'shotgun', which is a default linear updater, using high eta values may result in
@@ -63,14 +65,14 @@ matplot(xgb.gblinear.history(bst), type = 'l')

 # For xgb.cv:
 bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
-             callbacks = list(cb.gblinear.history()))
+              callbacks = list(cb.gblinear.history()))
 # coefficients in the CV fold #3
 matplot(xgb.gblinear.history(bst)[[3]], type = 'l')


 #### Multiclass classification:
 #
-dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1)
+dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 2)
 param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
              lambda = 0.0003, alpha = 0.0003, nthread = 2)
 # For the default linear updater 'shotgun' it sometimes is helpful
--- a/R-package/man/dim.xgb.DMatrix.Rd
+++ b/R-package/man/dim.xgb.DMatrix.Rd
@@ -19,7 +19,7 @@ be directly used with an \code{xgb.DMatrix} object.
 \examples{
 data(agaricus.train, package='xgboost')
 train <- agaricus.train
-dtrain <- xgb.DMatrix(train$data, label=train$label)
+dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)

 stopifnot(nrow(dtrain) == nrow(train$data))
 stopifnot(ncol(dtrain) == ncol(train$data))
--- a/R-package/man/dimnames.xgb.DMatrix.Rd
+++ b/R-package/man/dimnames.xgb.DMatrix.Rd
@@ -26,7 +26,7 @@ Since row names are irrelevant, it is recommended to use \code{colnames} directl
 \examples{
 data(agaricus.train, package='xgboost')
 train <- agaricus.train
-dtrain <- xgb.DMatrix(train$data, label=train$label)
+dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
 dimnames(dtrain)
 colnames(dtrain)
 colnames(dtrain) <- make.names(1:ncol(train$data))
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -34,7 +34,7 @@ The \code{name} field can be one of the following:
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

 labels <- getinfo(dtrain, 'label')
 setinfo(dtrain, 'label', 1-labels)
--- a/R-package/man/print.xgb.DMatrix.Rd
+++ b/R-package/man/print.xgb.DMatrix.Rd
@@ -19,7 +19,7 @@ Currently it displays dimensions and presence of info-fields and colnames.
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

 dtrain
 print(dtrain, verbose=TRUE)
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -33,7 +33,7 @@ The \code{name} field can be one of the following:
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

 labels <- getinfo(dtrain, 'label')
 setinfo(dtrain, 'label', 1-labels)
--- a/R-package/man/slice.xgb.DMatrix.Rd
+++ b/R-package/man/slice.xgb.DMatrix.Rd
@@ -28,7 +28,7 @@ original xgb.DMatrix object
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

 dsub <- slice(dtrain, 1:42)
 labels1 <- getinfo(dsub, 'label')
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -38,7 +38,7 @@ Supported input file formats are either a LIBSVM text file or a binary file that
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -16,7 +16,7 @@ Save xgb.DMatrix object to binary file
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
--- a/R-package/man/xgb.create.features.Rd
+++ b/R-package/man/xgb.create.features.Rd
@@ -59,8 +59,8 @@ a rule on certain features."
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
-dtest <- with(agaricus.test, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))

 param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
 nrounds = 4
@@ -76,8 +76,12 @@ new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
 new.features.test <- xgb.create.features(model = bst, agaricus.test$data)

 # learning with new features
-new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
-new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+new.dtrain <- xgb.DMatrix(
+  data = new.features.train, label = agaricus.train$label, nthread = 2
+)
+new.dtest <- xgb.DMatrix(
+  data = new.features.test, label = agaricus.test$label, nthread = 2
+)
 watchlist <- list(train = new.dtrain)
 bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)

--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -158,9 +158,9 @@ Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
-                  max_depth = 3, eta = 1, objective = "binary:logistic")
+             max_depth = 3, eta = 1, objective = "binary:logistic")
 print(cv)
 print(cv, verbose=TRUE)

--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -67,7 +67,7 @@ The "Yes" branches are marked by the "< split_value" label.
 The branches that also used for missing values are marked as bold
 (as in "carrying extra capacity").

-This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
+This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
 }
 \examples{
 data(agaricus.train, package='xgboost')
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -241,8 +241,8 @@ The following callbacks are automatically created when certain parameters are se
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')

-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
-dtest <- with(agaricus.test, xgb.DMatrix(data, label = label))
+dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
 watchlist <- list(train = dtrain, eval = dtest)

 ## A simple xgb.train example:
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -3,7 +3,7 @@ PKGROOT=../../
 ENABLE_STD_THREAD=1
 # _*_ mode: Makefile; _*_

-CXX_STD = CXX14
+CXX_STD = CXX17

 XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
           -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@@ -23,7 +23,6 @@ PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread
 OBJECTS= \
    ./xgboost_R.o \
    ./xgboost_custom.o \
-    ./xgboost_assert.o \
    ./init.o \
    $(PKGROOT)/src/metric/metric.o \
    $(PKGROOT)/src/metric/elementwise_metric.o \
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -3,7 +3,7 @@ PKGROOT=../../
 ENABLE_STD_THREAD=0
 # _*_ mode: Makefile; _*_

-CXX_STD = CXX14
+CXX_STD = CXX17

 XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
           -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@@ -23,7 +23,6 @@ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)  -DDMLC_CMAKE_LITTLE_ENDIAN=1  $(SHLIB_PTHRE
 OBJECTS= \
    ./xgboost_R.o \
    ./xgboost_custom.o \
-    ./xgboost_assert.o \
    ./init.o \
    $(PKGROOT)/src/metric/metric.o \
    $(PKGROOT)/src/metric/elementwise_metric.o \
--- a/R-package/src/xgboost_assert.c
+++ b/R-package/src/xgboost_assert.c
@@ -1,26 +0,0 @@
-// Copyright (c) 2014 by Contributors
-#include <stdio.h>
-#include <stdarg.h>
-#include <Rinternals.h>
-
-// implements error handling
-void XGBoostAssert_R(int exp, const char *fmt, ...) {
-  char buf[1024];
-  if (exp == 0) {
-    va_list args;
-    va_start(args, fmt);
-    vsprintf(buf, fmt, args);
-    va_end(args);
-    error("AssertError:%s\n", buf);
-  }
-}
-void XGBoostCheck_R(int exp, const char *fmt, ...) {
-  char buf[1024];
-  if (exp == 0) {
-    va_list args;
-    va_start(args, fmt);
-    vsprintf(buf, fmt, args);
-    va_end(args);
-    error("%s\n", buf);
-  }
-}
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -178,17 +178,10 @@ function(xgboost_set_cuda_flags target)
      $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
  endif (MSVC)

-  if (PLUGIN_RMM)
-    set_target_properties(${target} PROPERTIES
-      CUDA_STANDARD 17
-      CUDA_STANDARD_REQUIRED ON
-      CUDA_SEPARABLE_COMPILATION OFF)
-  else ()
-    set_target_properties(${target} PROPERTIES
-      CUDA_STANDARD 14
-      CUDA_STANDARD_REQUIRED ON
-      CUDA_SEPARABLE_COMPILATION OFF)
-  endif (PLUGIN_RMM)
+  set_target_properties(${target} PROPERTIES
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
 endfunction(xgboost_set_cuda_flags)

 macro(xgboost_link_nccl target)
@@ -205,17 +198,10 @@ endmacro(xgboost_link_nccl)

 # compile options
 macro(xgboost_target_properties target)
-  if (PLUGIN_RMM)
-    set_target_properties(${target} PROPERTIES
-      CXX_STANDARD 17
-      CXX_STANDARD_REQUIRED ON
-      POSITION_INDEPENDENT_CODE ON)
-  else ()
-    set_target_properties(${target} PROPERTIES
-      CXX_STANDARD 14
-      CXX_STANDARD_REQUIRED ON
-      POSITION_INDEPENDENT_CODE ON)
-  endif (PLUGIN_RMM)
+  set_target_properties(${target} PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON)

  if (HIDE_CXX_SYMBOLS)
    #-- Hide all C++ symbols
--- a/demo/c-api/CMakeLists.txt
+++ b/demo/c-api/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(xgboost-c-examples)

 add_subdirectory(basic)
--- a/demo/c-api/external-memory/CMakeLists.txt
+++ b/demo/c-api/external-memory/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(external-memory-demo LANGUAGES C VERSION 0.0.1)

 find_package(xgboost REQUIRED)
--- a/demo/c-api/inference/CMakeLists.txt
+++ b/demo/c-api/inference/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(inference-demo LANGUAGES C VERSION 0.0.1)
 find_package(xgboost REQUIRED)

--- a/2
+++ b/2
--- a/doc/c++.rst
+++ b/doc/c++.rst
@@ -8,5 +8,5 @@ As a result it's changing quite often and we don't maintain its stability.  Alon
 plugin system (see ``plugin/example`` in XGBoost's source tree), users can utilize some
 existing c++ headers for gaining more access to the internal of XGBoost.

-* `C++ interface documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/files.html>`_
+* `C++ interface documentation (latest master branch) <./dev/files.html>`_
 * `C++ interface documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/files.html>`_
--- a/doc/c.rst
+++ b/doc/c.rst
@@ -10,7 +10,7 @@ simply look at function comments in ``include/xgboost/c_api.h``. The reference i
 to sphinx with the help of breathe, which doesn't contain links to examples but might be
 easier to read. For the original doxygen pages please visit:

-* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
+* `C API documentation (latest master branch) <./dev/c__api_8h.html>`_
 * `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_

 ***************
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -11,54 +11,107 @@
 #
 # All configuration values have a default; values that are commented out
 # serve to show the default.
-from subprocess import call
-from sh.contrib import git
-import urllib.request
-from urllib.error import HTTPError
-import sys
-import re
 import os
+import re
+import shutil
 import subprocess
+import sys
+import tarfile
+import urllib.request
+import warnings
+from urllib.error import HTTPError

-git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None)
+from sh.contrib import git
+
+CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
+TMP_DIR = os.path.join(CURR_PATH, "tmp")
+DOX_DIR = "doxygen"
+
+
+def run_doxygen():
+    """Build the Doxygen HTML under PROJECT_ROOT/DOX_DIR and copy it to TMP_DIR/dev."""
+    curdir = os.path.normpath(os.path.abspath(os.path.curdir))
+    if os.path.exists(TMP_DIR):
+        print(f"Delete directory {TMP_DIR}")
+        shutil.rmtree(TMP_DIR)
+    # Always start from a fresh, existing TMP_DIR, even if the build below fails.
+    print(f"Create directory {TMP_DIR}")
+    os.mkdir(TMP_DIR)
+    try:
+        os.chdir(PROJECT_ROOT)
+        if not os.path.exists(DOX_DIR):
+            os.mkdir(DOX_DIR)
+        os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
+        print(
+            "Build doxygen at {}".format(
+                os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen")
+            )
+        )
+        subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
+        subprocess.check_call(["ninja", "doc_doxygen"])
+        # Stage the generated HTML under TMP_DIR; "dev" must not exist yet for copytree.
+        src = os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen", "html")
+        dest = os.path.join(TMP_DIR, "dev")
+        print(f"Copy directory {src} -> {dest}")
+        shutil.copytree(src, dest)
+    except OSError as e:  # best effort: report the failure and continue
+        sys.stderr.write("doxygen execution failed: %s" % e)
+    finally:
+        os.chdir(curdir)  # restore the caller's working directory
+
+
+def is_readthedocs_build():
+    on_rtd = os.environ.get("READTHEDOCS", None) == "True"  # opt-in flag for Doxygen
+    if not on_rtd:
+        warnings.warn(
+            "Skipping Doxygen build... You won't have documentation for C/C++ functions. "
+            "Set environment variable READTHEDOCS=True if you want to build Doxygen. "
+            "(If you do opt in, make sure to install Doxygen, Graphviz, CMake, and C++ compiler "
+            "on your system.)"
+        )
+    return on_rtd
+
+
+if is_readthedocs_build():
+    run_doxygen()
+
+
+git_branch = os.getenv("SPHINX_GIT_BRANCH", default=None)
 if not git_branch:
    # If SPHINX_GIT_BRANCH environment variable is not given, run git
    # to determine branch name
    git_branch = [
-        re.sub(r'origin/', '', x.lstrip(' ')) for x in str(
-            git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')
+        re.sub(r"origin/", "", x.lstrip(" "))
+        for x in str(git.branch("-r", "--contains", "HEAD")).rstrip("\n").split("\n")
    ]
-    git_branch = [x for x in git_branch if 'HEAD' not in x]
+    git_branch = [x for x in git_branch if "HEAD" not in x]
 else:
    git_branch = [git_branch]
-print('git_branch = {}'.format(git_branch[0]))
+print("git_branch = {}".format(git_branch[0]))

 try:
    filename, _ = urllib.request.urlretrieve(
-        'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(
-            git_branch[0]))
-    call(
-        'if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'
-        .format(filename),
-        shell=True)
+        f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{git_branch[0]}.tar.bz2"
+    )
+    if not os.path.exists(TMP_DIR):
+        print(f"Create directory {TMP_DIR}")
+        os.mkdir(TMP_DIR)
+    jvm_doc_dir = os.path.join(TMP_DIR, "jvm")
+    if os.path.exists(jvm_doc_dir):
+        print(f"Delete directory {jvm_doc_dir}")
+        shutil.rmtree(jvm_doc_dir)
+    print(f"Create directory {jvm_doc_dir}")
+    os.mkdir(jvm_doc_dir)
+
+    with tarfile.open(filename, "r:bz2") as t:
+        t.extractall(jvm_doc_dir)
 except HTTPError:
-    print('JVM doc not found. Skipping...')
-try:
-    filename, _ = urllib.request.urlretrieve(
-        'https://s3-us-west-2.amazonaws.com/xgboost-docs/doxygen/{}.tar.bz2'.
-        format(git_branch[0]))
-    call(
-        'mkdir -p tmp/dev; cd tmp/dev; tar xvf {}; mv doc_doxygen/html/* .; rm -rf doc_doxygen'
-        .format(filename),
-        shell=True)
-except HTTPError:
-    print('C API doc not found. Skipping...')
+    print("JVM doc not found. Skipping...")

 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
 libpath = os.path.join(PROJECT_ROOT, "python-package/")
 sys.path.insert(0, libpath)
 sys.path.insert(0, CURR_PATH)
@@ -81,50 +134,56 @@ release = xgboost.__version__
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
 extensions = [
-    'matplotlib.sphinxext.plot_directive',
-    'sphinx.ext.autodoc',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.intersphinx',
+    "matplotlib.sphinxext.plot_directive",
+    "sphinxcontrib.jquery",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.intersphinx",
    "sphinx_gallery.gen_gallery",
-    'breathe',
-    'recommonmark'
+    "breathe",
+    "recommonmark",
 ]

 sphinx_gallery_conf = {
    # path to your example scripts
    "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
    # path to where to save gallery generated output
-    "gallery_dirs": ["python/examples", "python/dask-examples", "python/survival-examples"],
+    "gallery_dirs": [
+        "python/examples",
+        "python/dask-examples",
+        "python/survival-examples",
+    ],
    "matplotlib_animations": True,
 }

 autodoc_typehints = "description"

-graphviz_output_format = 'png'
-plot_formats = [('svg', 300), ('png', 100), ('hires.png', 300)]
+graphviz_output_format = "png"
+plot_formats = [("svg", 300), ("png", 100), ("hires.png", 300)]
 plot_html_show_source_link = False
 plot_html_show_formats = False

 # Breathe extension variables
-DOX_DIR = "doxygen"
-breathe_projects = {
-    "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
-}
+breathe_projects = {}
+if is_readthedocs_build():
+    breathe_projects = {
+        "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
+    }
 breathe_default_project = "xgboost"

 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]

 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
-source_suffix = ['.rst', '.md']
+source_suffix = [".rst", ".md"]

 # The encoding of source files.
 # source_encoding = 'utf-8-sig'

 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -133,7 +192,7 @@ master_doc = 'index'
 # Usually you set "language" from the command line for these cases.
 language = "en"

-autoclass_content = 'both'
+autoclass_content = "both"

 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
@@ -143,8 +202,10 @@ autoclass_content = 'both'

 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-html_extra_path = ['./tmp']
+exclude_patterns = ["_build"]
+html_extra_path = []
+if is_readthedocs_build():
+    html_extra_path = [TMP_DIR]

 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -162,7 +223,7 @@ html_extra_path = ['./tmp']
 # show_authors = False

 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"

 # A list of ignored prefixes for module index sorting.
 # modindex_common_prefix = []
@@ -185,27 +246,24 @@ html_logo = "https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/lo

 html_css_files = ["css/custom.css"]

-html_sidebars = {
-  '**': ['logo-text.html', 'globaltoc.html', 'searchbox.html']
-}
+html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "searchbox.html"]}

 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]

 # Output file base name for HTML help builder.
-htmlhelp_basename = project + 'doc'
+htmlhelp_basename = project + "doc"

 # -- Options for LaTeX output ---------------------------------------------
-latex_elements = {
-}
+latex_elements = {}

 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  (master_doc, '%s.tex' % project, project, author, 'manual'),
+    (master_doc, "%s.tex" % project, project, author, "manual"),
 ]

 intersphinx_mapping = {
@@ -220,30 +278,5 @@ intersphinx_mapping = {
 }


-# hook for doxygen
-def run_doxygen():
-    """Run the doxygen make command in the designated folder."""
-    curdir = os.path.normpath(os.path.abspath(os.path.curdir))
-    try:
-        os.chdir(PROJECT_ROOT)
-        if not os.path.exists(DOX_DIR):
-            os.mkdir(DOX_DIR)
-        os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
-        subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
-        subprocess.check_call(["ninja", "doc_doxygen"])
-    except OSError as e:
-        sys.stderr.write("doxygen execution failed: %s" % e)
-    finally:
-        os.chdir(curdir)
-
-
-def generate_doxygen_xml(app):
-    """Run the doxygen make commands if we're on the ReadTheDocs server"""
-    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-    if read_the_docs_build:
-        run_doxygen()
-
-
 def setup(app):
-    app.add_css_file('custom.css')
-    app.connect("builder-inited", generate_doxygen_xml)
+    app.add_css_file("custom.css")
--- a/doc/contrib/release.rst
+++ b/doc/contrib/release.rst
@@ -4,7 +4,7 @@ XGBoost Release Policy
 =======================

 Versioning Policy
---------------------------
+-----------------

 Starting from XGBoost 1.0.0, each XGBoost release will be versioned as [MAJOR].[FEATURE].[MAINTENANCE]

@@ -34,6 +34,20 @@ Making a Release

   + The CRAN package is maintained by `Tong He <https://github.com/hetong007>`_ and `Jiaming Yuan <https://github.com/trivialfis>`__.

-     Before submitting a release, one should test the package on `R-hub <https://builder.r-hub.io/>`__ and `win-builder <https://win-builder.r-project.org/>`__ first.  Please note that the R-hub Windows instance doesn't have the exact same environment as the one hosted on win-builder.
-
   + The Maven package is maintained by `Nan Zhu <https://github.com/CodingCat>`_ and `Hyunsu Cho <https://github.com/hcho3>`_.
+
+
+R CRAN Package
+--------------
+Before submitting a release, one should test the package on `R-hub <https://builder.r-hub.io/>`__ and `win-builder <https://win-builder.r-project.org/>`__ first.  Please note that the R-hub Windows instance doesn't have the exact same environment as the one hosted on win-builder.
+
+According to the `CRAN policy <https://cran.r-project.org/web/packages/policies.html>`__:
+
+    If running a package uses multiple threads/cores it must never use more than two simultaneously: the check farm is a shared resource and will typically be running many checks simultaneously.
+
+We need to check the number of CPUs used in examples. Export ``_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_=2.5`` before running ``R CMD check --as-cran`` `[1] <#references>`__ and make sure the machine you are using has enough CPU cores to reveal any potential policy violation.
+
+References
+----------
+
+[1] https://stat.ethz.ch/pipermail/r-package-devel/2022q4/008610.html
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -44,8 +44,7 @@ General Parameters
 * ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]

  - When set to True, XGBoost will perform validation of input parameters to check whether
-    a parameter is used or not.  The feature is still experimental.  It's expected to have
-    some false positives.
+    a parameter is used or not.

 * ``nthread`` [default to maximum number of threads available if not set]

@@ -233,24 +232,21 @@ Parameters for Categorical Feature
 These parameters are only used for training with categorical data. See
 :doc:`/tutorials/categorical` for more information.

+.. note:: These parameters are experimental. ``exact`` tree method is not yet supported.
+
+
 * ``max_cat_to_onehot``

  .. versionadded:: 1.6.0

-  .. note:: This parameter is experimental. ``exact`` tree method is not yet supported.
-
  - A threshold for deciding whether XGBoost should use one-hot encoding based split for
    categorical data.  When number of categories is lesser than the threshold then one-hot
    encoding is chosen, otherwise the categories will be partitioned into children nodes.
-    Only relevant for regression and binary classification. Also, ``exact`` tree method is
-    not supported

 * ``max_cat_threshold``

  .. versionadded:: 1.7.0

-  .. note:: This parameter is experimental. ``exact`` tree method is not yet supported.
-
  - Maximum number of categories considered for each split. Used only by partition-based
    splits for preventing over-fitting.

--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -25,9 +25,6 @@ Core Data Structure
 .. autoclass:: xgboost.QuantileDMatrix
    :show-inheritance:

-.. autoclass:: xgboost.DeviceQuantileDMatrix
-    :show-inheritance:
-
 .. autoclass:: xgboost.Booster
    :members:
    :show-inheritance:
@@ -115,7 +112,7 @@ Dask API
    :inherited-members:
    :show-inheritance:

-.. autoclass:: xgboost.dask.DaskDeviceQuantileDMatrix
+.. autoclass:: xgboost.dask.DaskQuantileDMatrix
    :members:
    :inherited-members:
    :show-inheritance:
@@ -176,3 +173,13 @@ PySpark API
    :members:
    :inherited-members:
    :show-inheritance:
+
+.. autoclass:: xgboost.spark.SparkXGBRanker
+    :members:
+    :inherited-members:
+    :show-inheritance:
+
+.. autoclass:: xgboost.spark.SparkXGBRankerModel
+    :members:
+    :inherited-members:
+    :show-inheritance:
--- a/doc/tutorials/c_api_tutorial.rst
+++ b/doc/tutorials/c_api_tutorial.rst
@@ -45,7 +45,7 @@ Use ``find_package()`` and ``target_link_libraries()`` in your application's CMa

 .. code-block:: cmake

-    cmake_minimum_required(VERSION 3.13)
+    cmake_minimum_required(VERSION 3.18)
    project(your_project_name LANGUAGES C CXX VERSION your_project_version)
    find_package(xgboost REQUIRED)
    add_executable(your_project_name /path/to/project_file.c)
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -138,11 +138,11 @@ Miscellaneous

 By default, XGBoost assumes input categories are integers starting from 0 till the number
 of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
-values due to mistakes or missing values. It can be negative value, integer values that
-can not be accurately represented by 32-bit floating point, or values that are larger than
-actual number of unique categories.  During training this is validated but for prediction
-it's treated as the same as missing value for performance reasons.  Lastly, missing values
-are treated as the same as numerical features (using the learned split direction).
+values due to mistakes or missing values in the training dataset. An invalid value can be
+a negative value, an integer that cannot be accurately represented by 32-bit floating
+point, or a value larger than the actual number of unique categories.  During training
+this is validated, but for prediction it is treated the same as a not-chosen category for
+performance reasons.


 **********
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -564,7 +564,7 @@ Here are some pratices on reducing memory usage with dask and xgboost.
  nice summary.

 - When using GPU input, like dataframe loaded by ``dask_cudf``, you can try
-  :py:class:`xgboost.dask.DaskDeviceQuantileDMatrix` as a drop in replacement for ``DaskDMatrix``
+  :py:class:`xgboost.dask.DaskQuantileDMatrix` as a drop in replacement for ``DaskDMatrix``
  to reduce overall memory usage.  See
  :ref:`sphx_glr_python_dask-examples_gpu_training.py` for an example.

--- a/doc/tutorials/spark_estimator.rst
+++ b/doc/tutorials/spark_estimator.rst
@@ -43,10 +43,10 @@ in spark estimator, and some parameters are replaced with pyspark specific param
 such as `weight_col`, `validation_indicator_col`, `use_gpu`, for details please see
 `SparkXGBRegressor` doc.

-The following code snippet shows how to train a spark xgboost regressor model,
-first we need to prepare a training dataset as a spark dataframe contains
-"label" column and "features" column(s), the "features" column(s) must be `pyspark.ml.linalg.Vector`
-type or spark array type or a list of feature column names.
+The following code snippet shows how to train a spark xgboost regressor model. First, we
+need to prepare a training dataset as a spark dataframe that contains a "label" column and
+"features" column(s). The "features" column(s) must be of ``pyspark.ml.linalg.Vector``
+type, spark array type, or a list of feature column names.


 .. code-block:: python
@@ -54,10 +54,10 @@ type or spark array type or a list of feature column names.
  xgb_regressor_model = xgb_regressor.fit(train_spark_dataframe)


-The following code snippet shows how to predict test data using a spark xgboost regressor model,
-first we need to prepare a test dataset as a spark dataframe contains
-"features" and "label" column, the "features" column must be `pyspark.ml.linalg.Vector`
-type or spark array type.
+The following code snippet shows how to predict test data using a spark xgboost regressor
+model. First, we need to prepare a test dataset as a spark dataframe that contains
+"features" and "label" columns. The "features" column must be of
+``pyspark.ml.linalg.Vector`` type or spark array type.

 .. code-block:: python

@@ -107,8 +107,8 @@ virtualenv and pip:
  python -m venv xgboost_env
  source xgboost_env/bin/activate
  pip install pyarrow pandas venv-pack xgboost
-  # https://rapids.ai/pip.html#install
-  pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+  # https://docs.rapids.ai/install#pip-install
+  pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com
  venv-pack -o xgboost_env.tar.gz

 With Conda:
@@ -240,7 +240,7 @@ additional spark configurations and dependencies:
    --master spark://<master-ip>:7077 \
    --conf spark.executor.resource.gpu.amount=1 \
    --conf spark.task.resource.gpu.amount=1 \
-    --packages com.nvidia:rapids-4-spark_2.12:22.08.0 \
+    --packages com.nvidia:rapids-4-spark_2.12:23.04.0 \
    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
    --conf spark.sql.execution.arrow.maxRecordsPerBatch=1000000 \
    --archives xgboost_env.tar.gz#environment \
--- a/2
+++ b/2
--- a/include/xgboost/collective/socket.h
+++ b/include/xgboost/collective/socket.h
@@ -287,11 +287,22 @@ class TCPSocket {
 #elif defined(__APPLE__)
    return domain_;
 #elif defined(__unix__)
+#ifndef __PASE__
    std::int32_t domain;
    socklen_t len = sizeof(domain);
    xgboost_CHECK_SYS_CALL(
        getsockopt(handle_, SOL_SOCKET, SO_DOMAIN, reinterpret_cast<char *>(&domain), &len), 0);
    return ret_iafamily(domain);
+#else
+    struct sockaddr sa;
+    socklen_t sizeofsa = sizeof(sa);
+    xgboost_CHECK_SYS_CALL(
+      getsockname(handle_, &sa, &sizeofsa), 0);
+    if (sizeofsa < sizeof(uchar_t)*2) {
+      return ret_iafamily(AF_INET);
+    }
+    return ret_iafamily(sa.sa_family);
+#endif   // __PASE__
 #else
    LOG(FATAL) << "Unknown platform.";
    return ret_iafamily(AF_INET);
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -508,7 +508,7 @@ class RegTree : public Model {
     * \brief drop the trace after fill, must be called after fill.
     * \param inst The sparse instance to drop.
     */
-    void Drop(const SparsePage::Inst& inst);
+    void Drop();
    /*!
     * \brief returns the size of the feature vector
     * \return the size of the feature vector
@@ -709,13 +709,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
  has_missing_ = data_.size() != feature_count;
 }

-inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
-  for (auto const& entry : inst) {
-    if (entry.index >= data_.size()) {
-      continue;
-    }
-    data_[entry.index].flag = -1;
-  }
+inline void RegTree::FVec::Drop() {
+  Entry e{};
+  e.flag = -1;
+  std::fill_n(data_.data(), data_.size(), e);
  has_missing_ = true;
 }

--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -6,6 +6,6 @@

 #define XGBOOST_VER_MAJOR 1
 #define XGBOOST_VER_MINOR 7
-#define XGBOOST_VER_PATCH 0
+#define XGBOOST_VER_PATCH 6

 #endif  // XGBOOST_VERSION_CONFIG_H_
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@

    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.7.1</version>
+    <version>1.7.6</version>
    <packaging>pom</packaging>
    <name>XGBoost JVM Package</name>
    <description>JVM Package for XGBoost</description>
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.1</version>
+        <version>1.7.6</version>
    </parent>
    <artifactId>xgboost4j-example_2.12</artifactId>
-    <version>1.7.1</version>
+    <version>1.7.6</version>
    <packaging>jar</packaging>
    <build>
        <plugins>
@@ -26,7 +26,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-            <version>1.7.1</version>
+            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-            <version>1.7.1</version>
+            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.1</version>
+        <version>1.7.6</version>
    </parent>
    <artifactId>xgboost4j-flink_2.12</artifactId>
-    <version>1.7.1</version>
+    <version>1.7.6</version>
    <build>
        <plugins>
            <plugin>
@@ -26,7 +26,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.7.1</version>
+            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.1</version>
+        <version>1.7.6</version>
    </parent>
    <artifactId>xgboost4j-gpu_2.12</artifactId>
-    <version>1.7.1</version>
+    <version>1.7.6</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu
+++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu
@@ -1,7 +1,7 @@
 #include <jni.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>

 #include "../../../../src/common/device_helpers.cuh"
+#include "../../../../src/common/cuda_pinned_allocator.h"
 #include "../../../../src/data/array_interface.h"
 #include "jvm_utils.h"
 #include <xgboost/c_api.h>
@@ -131,7 +131,7 @@ class DataIteratorProxy {
  bool cache_on_host_{true}; // TODO(Bobby): Make this optional.

  template <typename T>
-  using Alloc = thrust::system::cuda::experimental::pinned_allocator<T>;
+  using Alloc = xgboost::common::cuda::pinned_allocator<T>;
  template <typename U>
  using HostVector = std::vector<U, Alloc<U>>;

--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -6,7 +6,7 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.1</version>
+        <version>1.7.6</version>
    </parent>
    <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
    <build>
@@ -24,7 +24,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
-            <version>1.7.1</version>
+            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.1</version>
+        <version>1.7.6</version>
    </parent>
    <artifactId>xgboost4j-spark_2.12</artifactId>
    <build>
@@ -24,7 +24,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.7.1</version>
+            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
--- a/jvm-packages/xgboost4j-tester/get_iris.py
+++ b/jvm-packages/xgboost4j-tester/get_iris.py
@@ -1,9 +1,9 @@
-from sklearn.datasets import load_iris
 import numpy as np
 import pandas
+from sklearn.datasets import load_iris

 X, y = load_iris(return_X_y=True)
-y = y.astype(np.int)
+y = y.astype(np.int32)
 df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
 class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'}
 df['class'] = np.vectorize(class_id_to_name.get)(y)
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.1</version>
+        <version>1.7.6</version>
    </parent>
    <artifactId>xgboost4j_2.12</artifactId>
-    <version>1.7.1</version>
+    <version>1.7.6</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -15,7 +15,7 @@ if (PLUGIN_UPDATER_ONEAPI)
  target_link_libraries(oneapi_plugin PUBLIC -fsycl)
  set_target_properties(oneapi_plugin PROPERTIES
    COMPILE_FLAGS -fsycl
-    CXX_STANDARD 14
+    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    POSITION_INDEPENDENT_CODE ON)
  if (USE_OPENMP)
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-1.7.1
+1.7.6
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -36,13 +36,13 @@ try:

    PANDAS_INSTALLED = True
 except ImportError:
-
    MultiIndex = object
    DataFrame = object
    Series = object
    pandas_concat = None
    PANDAS_INSTALLED = False

+
 # sklearn
 try:
    from sklearn.base import BaseEstimator as XGBModelBase
@@ -72,6 +72,22 @@ except ImportError:
    XGBStratifiedKFold = None


+_logger = logging.getLogger(__name__)
+
+
+def is_cudf_available() -> bool:
+    """Check whether the cuDF package can be imported.
+
+    Returns
+    -------
+    ``False`` when no module spec is found for ``cudf`` or when importing it raises
+    :py:class:`ImportError` (the failure is logged); ``True`` otherwise.
+
+    """
+    # Look up the module spec first so that a missing package is reported without
+    # attempting the import.
+    if importlib.util.find_spec("cudf") is None:
+        return False
+    try:
+        # The imported name is not used; the import only proves that it succeeds.
+        import cudf
+
+        return True
+    except ImportError:
+        _logger.exception("Importing cuDF failed, use DMatrix instead of QDM")
+        return False
+
+
 class XGBoostLabelEncoder(LabelEncoder):
    """Label encoder with JSON serialization methods."""

@@ -144,6 +160,7 @@ def concat(value: Sequence[_T]) -> _T:  # pylint: disable=too-many-return-statem
 # `importlib.utils`, except it's unclear from its document on how to use it.  This one
 # seems to be easy to understand and works out of box.

+
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2172,6 +2172,7 @@ class Booster:
        )
        return _prediction_output(shape, dims, preds, False)

+    # pylint: disable=too-many-statements
    def inplace_predict(
        self,
        data: DataType,
@@ -2192,10 +2193,10 @@ class Booster:

        .. code-block:: python

-            booster.set_param({'predictor': 'gpu_predictor'})
+            booster.set_param({"predictor": "gpu_predictor"})
            booster.inplace_predict(cupy_array)

-            booster.set_param({'predictor': 'cpu_predictor})
+            booster.set_param({"predictor": "cpu_predictor"})
            booster.inplace_predict(numpy_array)

        .. versionadded:: 1.1.0
@@ -2301,14 +2302,16 @@ class Booster:
            )
            return _prediction_output(shape, dims, preds, False)
        if isinstance(data, scipy.sparse.csr_matrix):
-            csr = data
+            from .data import _transform_scipy_csr
+
+            data = _transform_scipy_csr(data)
            _check_call(
                _LIB.XGBoosterPredictFromCSR(
                    self.handle,
-                    _array_interface(csr.indptr),
-                    _array_interface(csr.indices),
-                    _array_interface(csr.data),
-                    c_bst_ulong(csr.shape[1]),
+                    _array_interface(data.indptr),
+                    _array_interface(data.indices),
+                    _array_interface(data.data),
+                    c_bst_ulong(data.shape[1]),
                    from_pystr_to_cstr(json.dumps(args)),
                    p_handle,
                    ctypes.byref(shape),
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -853,7 +853,7 @@ async def _get_rabit_args(
        sched_addr = None

    # make sure all workers are online so that we can obtain reliable scheduler_info
-    client.wait_for_workers(n_workers)
+    await client.wait_for_workers(n_workers)  # type: ignore
    env = await client.run_on_scheduler(
        _start_tracker, n_workers, sched_addr, user_addr
    )
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -30,6 +30,7 @@ from .core import (
    c_array,
    c_str,
    from_pystr_to_cstr,
+    make_jcargs,
 )

 DispatchedDataBackendReturnType = Tuple[
@@ -80,6 +81,21 @@ def _array_interface(data: np.ndarray) -> bytes:
    return interface_str


+def _transform_scipy_csr(data: DataType) -> DataType:
+    """Pass the buffers of a CSR matrix through :py:func:`_ensure_np_dtype`.
+
+    ``indptr``, ``indices`` and ``data`` are each processed with their own dtype.
+    The input object is returned as is when all three buffers come back as the
+    same objects; otherwise a new ``csr_matrix`` with the same shape is built
+    from the returned buffers.
+
+    """
+    from scipy.sparse import csr_matrix
+
+    indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype)
+    indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype)
+    values, _ = _ensure_np_dtype(data.data, data.data.dtype)
+    # Identity checks: rebuild the matrix only if at least one buffer was replaced.
+    if (
+        indptr is not data.indptr
+        or indices is not data.indices
+        or values is not data.data
+    ):
+        data = csr_matrix((values, indices, indptr), shape=data.shape)
+    return data
+
+
 def _from_scipy_csr(
    data: DataType,
    missing: FloatCompatible,
@@ -93,18 +109,14 @@ def _from_scipy_csr(
            f"length mismatch: {len(data.indices)} vs {len(data.data)}"
        )
    handle = ctypes.c_void_p()
-    args = {
-        "missing": float(missing),
-        "nthread": int(nthread),
-    }
-    config = bytes(json.dumps(args), "utf-8")
+    data = _transform_scipy_csr(data)
    _check_call(
        _LIB.XGDMatrixCreateFromCSR(
            _array_interface(data.indptr),
            _array_interface(data.indices),
            _array_interface(data.data),
            c_bst_ulong(data.shape[1]),
-            config,
+            make_jcargs(missing=float(missing), nthread=int(nthread)),
            ctypes.byref(handle),
        )
    )
@@ -153,12 +165,13 @@ def _is_numpy_array(data: DataType) -> bool:


 def _ensure_np_dtype(
-    data: DataType,
-    dtype: Optional[NumpyDType]
+    data: DataType, dtype: Optional[NumpyDType]
 ) -> Tuple[np.ndarray, Optional[NumpyDType]]:
+    """Normalize the dtype and memory alignment of an array.
+
+    Arrays whose dtype contains Python objects, or is ``float16`` or ``bool``, are
+    cast to ``float32``.  Arrays that are not flagged as aligned are passed through
+    ``np.require`` with the ``"A"`` requirement.
+
+    Returns
+    -------
+    The resulting array together with the dtype: ``np.float32`` if the cast was
+    applied, otherwise the ``dtype`` argument unchanged.
+
+    """
     if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
-        data = data.astype(np.float32, copy=False)
         dtype = np.float32
+        data = data.astype(dtype, copy=False)
+    # "A" asks numpy for an aligned array.
+    if not data.flags.aligned:
+        data = np.require(data, requirements="A")
     return data, dtype


@@ -1197,11 +1210,13 @@ def _proxy_transform(
        data, _ = _ensure_np_dtype(data, data.dtype)
        return data, None, feature_names, feature_types
    if _is_scipy_csr(data):
+        data = _transform_scipy_csr(data)
        return data, None, feature_names, feature_types
    if _is_pandas_df(data):
        arr, feature_names, feature_types = _transform_pandas_df(
            data, enable_categorical, feature_names, feature_types
        )
+        arr, _ = _ensure_np_dtype(arr, arr.dtype)
        return arr, None, feature_names, feature_types
    raise TypeError("Value type is not supported for data iterator:" + str(type(data)))

--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -674,7 +674,7 @@ class XGBModel(XGBModelBase):
                    self.kwargs = {}
                self.kwargs[key] = value

-        if hasattr(self, "_Booster"):
+        if self.__sklearn_is_fitted__():
            parameters = self.get_xgb_params()
            self.get_booster().set_param(parameters)

@@ -701,39 +701,12 @@ class XGBModel(XGBModelBase):
                np.iinfo(np.int32).max
            )

-        def parse_parameter(value: Any) -> Optional[Union[int, float, str]]:
-            for t in (int, float, str):
-                try:
-                    ret = t(value)
-                    return ret
-                except ValueError:
-                    continue
-            return None
-
-        # Get internal parameter values
-        try:
-            config = json.loads(self.get_booster().save_config())
-            stack = [config]
-            internal = {}
-            while stack:
-                obj = stack.pop()
-                for k, v in obj.items():
-                    if k.endswith("_param"):
-                        for p_k, p_v in v.items():
-                            internal[p_k] = p_v
-                    elif isinstance(v, dict):
-                        stack.append(v)
-
-            for k, v in internal.items():
-                if k in params and params[k] is None:
-                    params[k] = parse_parameter(v)
-        except ValueError:
-            pass
        return params

    def get_xgb_params(self) -> Dict[str, Any]:
        """Get xgboost specific parameters."""
-        params = self.get_params()
+        params: Dict[str, Any] = self.get_params()
+
        # Parameters that should not go into native learner.
        wrapper_specific = {
            "importance_type",
@@ -750,6 +723,7 @@ class XGBModel(XGBModelBase):
        for k, v in params.items():
            if k not in wrapper_specific and not callable(v):
                filtered[k] = v
+
        return filtered

    def get_num_boosting_rounds(self) -> int:
@@ -1070,7 +1044,7 @@ class XGBModel(XGBModelBase):
        # error with incompatible data type.
        # Inplace predict doesn't handle as many data types as DMatrix, but it's
        # sufficient for dask interface where input is simpiler.
-        predictor = self.get_params().get("predictor", None)
+        predictor = self.get_xgb_params().get("predictor", None)
        if predictor in ("auto", None) and self.booster != "gblinear":
            return True
        return False
@@ -1336,7 +1310,7 @@ class XGBModel(XGBModelBase):
        -------
        coef_ : array of shape ``[n_features]`` or ``[n_classes, n_features]``
        """
-        if self.get_params()["booster"] != "gblinear":
+        if self.get_xgb_params()["booster"] != "gblinear":
            raise AttributeError(
                f"Coefficients are not defined for Booster type {self.booster}"
            )
@@ -1366,7 +1340,7 @@ class XGBModel(XGBModelBase):
        -------
        intercept_ : array of shape ``(1,)`` or ``[n_classes]``
        """
-        if self.get_params()["booster"] != "gblinear":
+        if self.get_xgb_params()["booster"] != "gblinear":
            raise AttributeError(
                f"Intercept (bias) is not defined for Booster type {self.booster}"
            )
--- a/python-package/xgboost/spark/init.py
+++ b/python-package/xgboost/spark/init.py
@@ -1,6 +1,5 @@
 # type: ignore
-"""PySpark XGBoost integration interface
-"""
+"""PySpark XGBoost integration interface"""

 try:
    import pyspark
@@ -11,6 +10,7 @@ from .estimator import (
    SparkXGBClassifier,
    SparkXGBClassifierModel,
    SparkXGBRanker,
+    SparkXGBRankerModel,
    SparkXGBRegressor,
    SparkXGBRegressorModel,
 )
@@ -21,4 +21,5 @@ __all__ = [
    "SparkXGBRegressor",
    "SparkXGBRegressorModel",
    "SparkXGBRanker",
+    "SparkXGBRankerModel",
 ]
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -1,7 +1,7 @@
 # type: ignore
 """Xgboost pyspark integration submodule for core code."""
 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
-# pylint: disable=too-few-public-methods, too-many-lines
+# pylint: disable=too-few-public-methods, too-many-lines, too-many-branches
 import json
 from typing import Iterator, Optional, Tuple

@@ -32,6 +32,7 @@ from pyspark.sql.types import (
    ShortType,
 )
 from scipy.special import expit, softmax  # pylint: disable=no-name-in-module
+from xgboost.compat import is_cudf_available
 from xgboost.core import Booster
 from xgboost.training import train as worker_train

@@ -139,6 +140,13 @@ _unsupported_predict_params = {
 }


+# TODO: supply hint message for all other unsupported params.
+_unsupported_params_hint_message = {
+    "enable_categorical": "`xgboost.spark` estimators do not have 'enable_categorical' param, "
+    "but you can set `feature_types` param and mark categorical features with 'c' string."
+}
+
+
 class _SparkXGBParams(
    HasFeaturesCol,
    HasLabelCol,
@@ -522,7 +530,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                    or k in _unsupported_predict_params
                    or k in _unsupported_train_params
                ):
-                    raise ValueError(f"Unsupported param '{k}'.")
+                    err_msg = _unsupported_params_hint_message.get(
+                        k, f"Unsupported param '{k}'."
+                    )
+                    raise ValueError(err_msg)
                _extra_params[k] = v
        _existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
        self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
@@ -728,6 +739,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            else:
                dataset = dataset.repartition(num_workers)

+        if self.isDefined(self.qid_col) and self.getOrDefault(self.qid_col):
+            # XGBoost requires qid to be sorted for each partition
+            dataset = dataset.sortWithinPartitions(alias.qid, ascending=True)
+
        train_params = self._get_distributed_train_params(dataset)
        booster_params, train_call_kwargs_params = self._get_xgb_train_call_args(
            train_params
@@ -744,6 +759,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            "feature_weights": self.getOrDefault(self.feature_weights),
            "missing": float(self.getOrDefault(self.missing)),
        }
+        if dmatrix_kwargs["feature_types"] is not None:
+            dmatrix_kwargs["enable_categorical"] = True
        booster_params["nthread"] = cpu_per_task
        use_gpu = self.getOrDefault(self.use_gpu)

@@ -755,7 +772,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            k: v for k, v in train_call_kwargs_params.items() if v is not None
        }
        dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
-        use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+
+        use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

        def _train_booster(pandas_df_iter):
            """Takes in an RDD partition and outputs a booster for that partition after
@@ -769,6 +787,15 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):

            gpu_id = None

+            # If cuDF is not installed, use DMatrix instead of QDM, because
+            # without cuDF, DMatrix performs better than QDM.
+            # Note: `is_cudf_available` is checked on the Spark worker side because
+            # the worker might have a different Python environment from the driver.
+            if use_gpu:
+                use_qdm = use_hist and is_cudf_available()
+            else:
+                use_qdm = use_hist
+
            if use_qdm and (booster_params.get("max_bin", None) is not None):
                dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

@@ -839,7 +866,11 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
        result_xgb_model = self._convert_to_sklearn_model(
            bytearray(booster, "utf-8"), config
        )
-        return self._copyValues(self._create_pyspark_model(result_xgb_model))
+        spark_model = self._create_pyspark_model(result_xgb_model)
+        # According to the PySpark ML convention, the model uid should be the same
+        # as the estimator uid.
+        spark_model._resetUid(self.uid)
+        return self._copyValues(spark_model)

    def write(self):
        """
--- a/rabit/CMakeLists.txt
+++ b/rabit/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.3)
+cmake_minimum_required(VERSION 3.18)

 find_package(Threads REQUIRED)

--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@@ -48,20 +48,21 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
  return cat < 0 || cat >= kMaxCat;
 }

-/* \brief Whether should it traverse to left branch of a tree.
+/**
+ * \brief Whether it should traverse to the left branch of a tree.
 *
- *  For one hot split, go to left if it's NOT the matching category.
+ *   Go to left if it's NOT the matching category, which matches one-hot encoding.
 */
-template <bool validate = true>
-inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
+inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
  KCatBitField const s_cats(cats);
-  // FIXME: Size() is not accurate since it represents the size of bit set instead of
-  // actual number of categories.
-  if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
-    return dft_left;
+  if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
+    return true;
  }

  auto pos = KCatBitField::ToBitPos(cat);
+  // If the input category is larger than the size of the bit field, it implies that the
+  // category is not chosen. Otherwise the bit field would have the category instead of
+  // being smaller than the category value.
  if (pos.int_pos >= cats.size()) {
    return true;
  }
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
    feature_offsets_[fid] = accum_index;
  }

-  SetTypeSize(gmat.max_num_bins);
+  SetTypeSize(gmat.MaxNumBinPerFeat());
  auto storage_size =
      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
  index_.resize(storage_size, 0);
--- a/src/common/cuda_pinned_allocator.h
+++ b/src/common/cuda_pinned_allocator.h
@@ -0,0 +1,91 @@
+/*!
+ * Copyright 2022 by XGBoost Contributors
+ * \file cuda_pinned_allocator.h
+ * \brief cuda pinned allocator for usage with thrust containers
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+
+#include "common.h"
+
+namespace xgboost {
+namespace common {
+namespace cuda {
+
+// \p pinned_allocator is a CUDA-specific host memory allocator
+//  that employs \c cudaMallocHost for allocation.
+//
+// This implementation is ported from the experimental/pinned_allocator
+// that Thrust used to provide.
+//
+//  \see https://en.cppreference.com/w/cpp/memory/allocator
+template <typename T>
+class pinned_allocator;
+
+// Specialization for void: it only declares the member types and rebind, and has
+// no allocate()/deallocate() members.
+template <>
+class pinned_allocator<void> {
+ public:
+  using value_type      = void;            // NOLINT: The type of the elements in the allocator
+  using pointer         = void*;           // NOLINT: The type returned by address() / allocate()
+  using const_pointer   = const void*;     // NOLINT: The type returned by address()
+  using size_type       = std::size_t;     // NOLINT: The type used for the size of the allocation
+  using difference_type = std::ptrdiff_t;  // NOLINT: The type of the distance between two pointers
+
+  template <typename U>
+  struct rebind {                       // NOLINT
+    using other = pinned_allocator<U>;  // NOLINT: The rebound type
+  };
+};
+
+
+// Primary template. The class has no data members: memory comes from
+// cudaMallocHost and is released with cudaFreeHost.
+template <typename T>
+class pinned_allocator {
+ public:
+  using value_type      = T;               // NOLINT: The type of the elements in the allocator
+  using pointer         = T*;              // NOLINT: The type returned by address() / allocate()
+  using const_pointer   = const T*;        // NOLINT: The type returned by address()
+  using reference       = T&;              // NOLINT: The parameter type for address()
+  using const_reference = const T&;        // NOLINT: The parameter type for address()
+  using size_type       = std::size_t;     // NOLINT: The type used for the size of the allocation
+  using difference_type = std::ptrdiff_t;  // NOLINT: The type of the distance between two pointers
+
+  template <typename U>
+  struct rebind {                       // NOLINT
+    using other = pinned_allocator<U>;  // NOLINT: The rebound type
+  };
+
+  XGBOOST_DEVICE inline pinned_allocator() {}; // NOLINT: host/device markup ignored on defaulted functions
+  XGBOOST_DEVICE inline ~pinned_allocator() {} // NOLINT: host/device markup ignored on defaulted functions
+  XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT: host/device markup ignored on defaulted functions
+
+
+  // Converting constructor from an allocator of another element type; there is
+  // no state to copy.
+  template <typename U>
+  XGBOOST_DEVICE inline pinned_allocator(pinned_allocator<U> const&) {} // NOLINT
+
+  XGBOOST_DEVICE inline pointer address(reference r) { return &r; } // NOLINT
+  XGBOOST_DEVICE inline const_pointer address(const_reference r) { return &r; } // NOLINT
+
+  // Allocate storage for `cnt` elements with cudaMallocHost. The second (hint)
+  // parameter is unnamed and ignored. Throws std::bad_alloc when `cnt` exceeds
+  // max_size(), which also keeps `cnt * sizeof(value_type)` from overflowing.
+  // The CUDA return code is passed to dh::safe_cuda.
+  inline pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT
+    if (cnt > this->max_size()) { throw std::bad_alloc(); }  // end if
+
+    pointer result(nullptr);
+    dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    return result;
+  }
+
+  // Release memory with cudaFreeHost; the element-count argument is ignored.
+  inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT
+
+  // Largest element count whose size in bytes is still representable in size_type.
+  inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // NOLINT
+
+  // Always true: the argument is never inspected.
+  XGBOOST_DEVICE inline bool operator==(pinned_allocator const& x) const { return true; }
+
+  XGBOOST_DEVICE inline bool operator!=(pinned_allocator const& x) const {
+    return !operator==(x);
+  }
+};
+}  // namespace cuda
+}  // namespace common
+}  // namespace xgboost
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@@ -62,7 +62,7 @@ void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView<T, D> t,
 #endif  // !defined(XGBOOST_USE_CUDA)

 template <typename T, std::int32_t kDim>
-auto cbegin(TensorView<T, kDim> v) {  // NOLINT
+auto cbegin(TensorView<T, kDim> const& v) {  // NOLINT
  auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& {
    return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape()));
  });
@@ -70,19 +70,19 @@ auto cbegin(TensorView<T, kDim> v) {  // NOLINT
 }

+// Read-only end iterator: cbegin(v) advanced by v.Size() positions.
+// NOTE(review): cbegin's lambda captures `v` by reference, so presumably the view
+// has to outlive the returned iterator - confirm.
 template <typename T, std::int32_t kDim>
-auto cend(TensorView<T, kDim> v) {  // NOLINT
+auto cend(TensorView<T, kDim> const& v) {  // NOLINT
   return cbegin(v) + v.Size();
 }

+// Mutable iterator over the elements of `v`: position i is unravelled against
+// v.Shape() and a reference to the addressed element is returned.
+// NOTE(review): the lambda captures `v` by reference, so presumably the view has
+// to outlive the returned iterator - confirm.
 template <typename T, std::int32_t kDim>
-auto begin(TensorView<T, kDim> v) {  // NOLINT
+auto begin(TensorView<T, kDim>& v) {  // NOLINT
   auto it = common::MakeIndexTransformIter(
       [&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); });
   return it;
 }

+// Mutable end iterator: begin(v) advanced by v.Size() positions.
 template <typename T, std::int32_t kDim>
-auto end(TensorView<T, kDim> v) {  // NOLINT
+auto end(TensorView<T, kDim>& v) {  // NOLINT
   return begin(v) + v.Size();
 }
 }  // namespace linalg
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -144,7 +144,7 @@ class PartitionBuilder {
        auto gidx = gidx_calc(ridx);
        bool go_left = default_left;
        if (gidx > -1) {
-          go_left = Decision(node_cats, cut_values[gidx], default_left);
+          go_left = Decision(node_cats, cut_values[gidx]);
        }
        return go_left;
      } else {
@@ -157,7 +157,7 @@ class PartitionBuilder {
      bool go_left = default_left;
      if (gidx > -1) {
        if (is_cat) {
-          go_left = Decision(node_cats, cut_values[gidx], default_left);
+          go_left = Decision(node_cats, cut_values[gidx]);
        } else {
          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
        }
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2021 by Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
 * \file array_interface.h
 * \brief View of __array_interface__
 */
@@ -7,9 +7,11 @@
 #define XGBOOST_DATA_ARRAY_INTERFACE_H_

 #include <algorithm>
-#include <cinttypes>
+#include <cstddef>  // std::size_t
+#include <cstdint>
 #include <map>
 #include <string>
+#include <type_traits>  // std::alignment_of,std::remove_pointer_t
 #include <utility>
 #include <vector>

@@ -101,7 +103,7 @@ class ArrayInterfaceHandler {
  template <typename PtrType>
  static PtrType GetPtrFromArrayData(Object::Map const &obj) {
    auto data_it = obj.find("data");
-    if (data_it == obj.cend()) {
+    if (data_it == obj.cend() || IsA<Null>(data_it->second)) {
      LOG(FATAL) << "Empty data passed in.";
    }
    auto p_data = reinterpret_cast<PtrType>(
@@ -111,7 +113,7 @@ class ArrayInterfaceHandler {

  static void Validate(Object::Map const &array) {
    auto version_it = array.find("version");
-    if (version_it == array.cend()) {
+    if (version_it == array.cend() || IsA<Null>(version_it->second)) {
      LOG(FATAL) << "Missing `version' field for array interface";
    }
    if (get<Integer const>(version_it->second) > 3) {
@@ -119,17 +121,19 @@ class ArrayInterfaceHandler {
    }

    auto typestr_it = array.find("typestr");
-    if (typestr_it == array.cend()) {
+    if (typestr_it == array.cend() || IsA<Null>(typestr_it->second)) {
      LOG(FATAL) << "Missing `typestr' field for array interface";
    }

    auto typestr = get<String const>(typestr_it->second);
    CHECK(typestr.size() == 3 || typestr.size() == 4) << ArrayInterfaceErrors::TypestrFormat();

-    if (array.find("shape") == array.cend()) {
+    auto shape_it = array.find("shape");
+    if (shape_it == array.cend() || IsA<Null>(shape_it->second)) {
      LOG(FATAL) << "Missing `shape' field for array interface";
    }
-    if (array.find("data") == array.cend()) {
+    auto data_it = array.find("data");
+    if (data_it == array.cend() || IsA<Null>(data_it->second)) {
      LOG(FATAL) << "Missing `data' field for array interface";
    }
  }
@@ -139,8 +143,9 @@ class ArrayInterfaceHandler {
  static size_t ExtractMask(Object::Map const &column,
                            common::Span<RBitField8::value_type> *p_out) {
    auto &s_mask = *p_out;
-    if (column.find("mask") != column.cend()) {
-      auto const &j_mask = get<Object const>(column.at("mask"));
+    auto const &mask_it = column.find("mask");
+    if (mask_it != column.cend() && !IsA<Null>(mask_it->second)) {
+      auto const &j_mask = get<Object const>(mask_it->second);
      Validate(j_mask);

      auto p_mask = GetPtrFromArrayData<RBitField8::value_type *>(j_mask);
@@ -173,8 +178,9 @@ class ArrayInterfaceHandler {
      // assume 1 byte alignment.
      size_t const span_size = RBitField8::ComputeStorageSize(n_bits);

-      if (j_mask.find("strides") != j_mask.cend()) {
-        auto strides = get<Array const>(column.at("strides"));
+      auto strides_it = j_mask.find("strides");
+      if (strides_it != j_mask.cend() && !IsA<Null>(strides_it->second)) {
+        auto strides = get<Array const>(strides_it->second);
        CHECK_EQ(strides.size(), 1) << ArrayInterfaceErrors::Dimension(1);
        CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ArrayInterfaceErrors::Contiguous();
      }
@@ -390,6 +396,11 @@ class ArrayInterface {

    data = ArrayInterfaceHandler::ExtractData(array, n);
    static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported.");
+
+    auto alignment = this->ElementAlignment();
+    auto ptr = reinterpret_cast<uintptr_t>(this->data);
+    CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment.";
+
    if (allow_mask) {
      common::Span<RBitField8::value_type> s_mask;
      size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask);
@@ -401,7 +412,9 @@ class ArrayInterface {
                            << "XGBoost doesn't support internal broadcasting.";
      }
    } else {
-      CHECK(array.find("mask") == array.cend()) << "Masked array is not yet supported.";
+      auto mask_it = array.find("mask");
+      CHECK(mask_it == array.cend() || IsA<Null>(mask_it->second))
+          << "Masked array is not yet supported.";
    }

    auto stream_it = array.find("stream");
@@ -506,9 +519,15 @@ class ArrayInterface {
    return func(reinterpret_cast<uint64_t const *>(data));
  }

-  XGBOOST_DEVICE size_t ElementSize() {
-    return this->DispatchCall(
-        [](auto *p_values) { return sizeof(std::remove_pointer_t<decltype(p_values)>); });
+  // Size in bytes of one element. DispatchCall hands the lambda a typed pointer;
+  // only the pointee type is used, the pointer itself is never dereferenced.
+  XGBOOST_DEVICE std::size_t ElementSize() const {
+    return this->DispatchCall([](auto *typed_data_ptr) {
+      return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
+    });
+  }
+  // Alignment requirement (std::alignment_of) of the element type, obtained from
+  // the pointee type of the typed pointer that DispatchCall passes to the lambda.
+  XGBOOST_DEVICE std::size_t ElementAlignment() const {
+    return this->DispatchCall([](auto *typed_data_ptr) {
+      return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
+    });
+  }

  template <typename T = float, typename... Index>
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM

 GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
                                   double sparse_thresh, bool sorted_sketch, int32_t n_threads,
-                                   common::Span<float> hess) {
+                                   common::Span<float> hess)
+    : max_numeric_bins_per_feat{max_bins_per_feat} {
  CHECK(p_fmat->SingleColBlock());
  // We use sorted sketching for approx tree method since it's more efficient in
  // computation time (but higher memory usage).
  cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);

-  max_num_bins = max_bins_per_feat;
  const uint32_t nbins = cut.Ptrs().back();
  hit_count.resize(nbins, 0);
  hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
    : row_ptr(info.num_row_ + 1, 0),
      hit_count(cuts.TotalBins(), 0),
      cut{std::forward<common::HistogramCuts>(cuts)},
-      max_num_bins(max_bin_per_feat),
+      max_numeric_bins_per_feat(max_bin_per_feat),
      isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}

 #if !defined(XGBOOST_USE_CUDA)
@@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
 }

 GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
-                                   common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
-                                   bool isDense, double sparse_thresh, int32_t n_threads) {
+                                   common::HistogramCuts cuts, int32_t max_bins_per_feat,
+                                   bool isDense, double sparse_thresh, int32_t n_threads)
+    : cut{std::move(cuts)},
+      max_numeric_bins_per_feat{max_bins_per_feat},
+      base_rowid{batch.base_rowid},
+      isDense_{isDense} {
  CHECK_GE(n_threads, 1);
-  base_rowid = batch.base_rowid;
-  isDense_ = isDense;
-  cut = cuts;
-  max_num_bins = max_bins_per_feat;
  CHECK_EQ(row_ptr.size(), 0);
  // The number of threads is pegged to the batch size. If the OMP
  // block is parallelized on anything other than the batch/block size,
@@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH

 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
-  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
+  if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
+      isDense) {
    // compress dense index to uint8
    index.SetBinTypeSize(common::kUint8BinsTypeSize);
    index.Resize((sizeof(uint8_t)) * n_index);
-  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
-              max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
+  } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
+              MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
             isDense) {
    // compress dense index to uint16
    index.SetBinTypeSize(common::kUint16BinsTypeSize);
@@ -148,10 +149,28 @@ common::ColumnMatrix const &GHistIndexMatrix::Transpose() const {
  return *columns_;
 }

+bst_bin_t GHistIndexMatrix::GetGindex(size_t ridx, size_t fidx) const {
+  auto begin = RowIdx(ridx);
+  if (IsDense()) {
+    return static_cast<bst_bin_t>(index[begin + fidx]);
+  }
+  auto end = RowIdx(ridx + 1);
+  auto const& cut_ptrs = cut.Ptrs();
+  auto f_begin = cut_ptrs[fidx];
+  auto f_end = cut_ptrs[fidx + 1];
+  return BinarySearchBin(begin, end, index, f_begin, f_end);
+}
+
 float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
  auto const &values = cut.Values();
  auto const &mins = cut.MinValues();
  auto const &ptrs = cut.Ptrs();
+  return this->GetFvalue(ptrs, values, mins, ridx, fidx, is_cat);
+}
+
+float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
+                                  std::vector<float> const &values, std::vector<float> const &mins,
+                                  bst_row_t ridx, bst_feature_t fidx, bool is_cat) const {
  if (is_cat) {
    auto f_begin = ptrs[fidx];
    auto f_end = ptrs[fidx + 1];
@@ -171,24 +190,27 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
    }
    return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
  };
-
-  if (columns_->GetColumnType(fidx) == common::kDenseColumn) {
-    if (columns_->AnyMissing()) {
+  switch (columns_->GetColumnType(fidx)) {
+    case common::kDenseColumn: {
+      if (columns_->AnyMissing()) {
+        return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
+          auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
+          return get_bin_val(column);
+        });
+      } else {
+        return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
+          auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
+          auto bin_idx = column[ridx];
+          return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
+        });
+      }
+    }
+    case common::kSparseColumn: {
      return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-        auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
-        return get_bin_val(column);
-      });
-    } else {
-      return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-        auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
+        auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
        return get_bin_val(column);
      });
    }
-  } else {
-    return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-      auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
-      return get_bin_val(column);
-    });
  }

  SPAN_CHECK(false);
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,

 GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
                                   EllpackPage const& in_page, BatchParam const& p)
-    : max_num_bins{p.max_bin} {
+    : max_numeric_bins_per_feat{p.max_bin} {
  auto page = in_page.Impl();
  isDense_ = page->is_dense;

--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -133,11 +133,15 @@ class GHistIndexMatrix {
  std::vector<size_t> hit_count;
  /*! \brief The corresponding cuts */
  common::HistogramCuts cut;
-  /*! \brief max_bin for each feature. */
-  bst_bin_t max_num_bins;
+  /** \brief max_bin for each feature. */
+  bst_bin_t max_numeric_bins_per_feat;
  /*! \brief base row index for current page (used by external memory) */
  size_t base_rowid{0};

+  bst_bin_t MaxNumBinPerFeat() const {
+    return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
+  }
+
  ~GHistIndexMatrix();
  /**
   * \brief Constrcutor for SimpleDMatrix.
@@ -160,7 +164,7 @@ class GHistIndexMatrix {
   * \brief Constructor for external memory.
   */
  GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
-                   common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
+                   common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
                   double sparse_thresh, int32_t n_threads);
  GHistIndexMatrix();  // also for ext mem, empty ctor so that we can read the cache back.

@@ -223,7 +227,12 @@ class GHistIndexMatrix {

  common::ColumnMatrix const& Transpose() const;

+  bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
+
  float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
+  float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
+                  std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
+                  bool is_cat) const;

 private:
  std::unique_ptr<common::ColumnMatrix> columns_;
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
    if (!fi->Read(&page->hit_count)) {
      return false;
    }
-    if (!fi->Read(&page->max_num_bins)) {
+    if (!fi->Read(&page->max_numeric_bins_per_feat)) {
      return false;
    }
    if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
        page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
        sizeof(uint64_t);
    // max_bins, base row, is_dense
-    fo->Write(page.max_num_bins);
-    bytes += sizeof(page.max_num_bins);
+    fo->Write(page.max_numeric_bins_per_feat);
+    bytes += sizeof(page.max_numeric_bins_per_feat);
    fo->Write(page.base_rowid);
    bytes += sizeof(page.base_rowid);
    fo->Write(page.IsDense());
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -58,6 +58,13 @@ void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, Bat
    }
  };
  auto ellpack = [&]() {
+    // workaround ellpack being initialized from CPU.
+    if (p.gpu_id == Context::kCpuId) {
+      p.gpu_id = ref_->Ctx()->gpu_id;
+    }
+    if (p.gpu_id == Context::kCpuId) {
+      p.gpu_id = 0;
+    }
    for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
      GetCutsFromEllpack(page, p_cuts);
      break;
@@ -103,6 +110,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
      size_t n_threads = ctx_.Threads();
      size_t n_features = column_sizes.size();
      linalg::Tensor<size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
+      column_sizes_tloc.Data()->Fill(0);
      auto view = column_sizes_tloc.HostView();
      common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
        auto const& line = value.GetLine(i);
@@ -172,9 +180,9 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
    size_t i = 0;
    while (iter.Next()) {
      if (!p_sketch) {
-        p_sketch.reset(new common::HostSketchContainer{batch_param_.max_bin,
-                                                       proxy->Info().feature_types.ConstHostSpan(),
-                                                       column_sizes, false, ctx_.Threads()});
+        p_sketch.reset(new common::HostSketchContainer{
+            batch_param_.max_bin, proxy->Info().feature_types.ConstHostSpan(), column_sizes,
+            !proxy->Info().group_ptr_.empty(), ctx_.Threads()});
      }
      HostAdapterDispatch(proxy, [&](auto const& batch) {
        proxy->Info().num_nonzero_ = batch_nnz[i];
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -42,6 +42,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
    out->Info() = this->Info().Slice(ridxs);
    out->Info().num_nonzero_ = h_offset.back();
  }
+  out->ctx_ = this->ctx_;
  return out;
 }

--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -28,6 +28,7 @@
 #include "xgboost/logging.h"
 #include "xgboost/objective.h"
 #include "xgboost/predictor.h"
+#include "xgboost/string_view.h"
 #include "xgboost/tree_updater.h"

 namespace xgboost {
@@ -395,23 +396,36 @@ void GBTree::LoadConfig(Json const& in) {
  tparam_.process_type = TreeProcessType::kDefault;
  int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
  if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
-    LOG(WARNING)
-        << "Loading from a raw memory buffer on CPU only machine.  "
-           "Changing predictor to auto.";
+    LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine.  "
+                    "Changing predictor to auto.";
    tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
  }
+
+  auto msg = StringView{
+      R"(
+  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
+  machine. Consider using `save_model/load_model` instead. See:
+
+    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
+
+  for more details about differences between saving model and serializing.)"};
+
  if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) {
    tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}});
-    LOG(WARNING)
-        << "Loading from a raw memory buffer on CPU only machine.  "
-           "Changing tree_method to hist.";
+    LOG(WARNING) << msg << "  Changing `tree_method` to `hist`.";
  }

  auto const& j_updaters = get<Object const>(in["updater"]);
  updaters_.clear();
+
  for (auto const& kv : j_updaters) {
-    std::unique_ptr<TreeUpdater> up(
-        TreeUpdater::Create(kv.first, ctx_, model_.learner_model_param->task));
+    auto name = kv.first;
+    if (n_gpus == 0 && name == "grow_gpu_hist") {
+      name = "grow_quantile_histmaker";
+      LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
+    }
+    std::unique_ptr<TreeUpdater> up{
+        TreeUpdater::Create(name, ctx_, model_.learner_model_param->task)};
    up->LoadConfig(kv.second);
    updaters_.push_back(std::move(up));
  }
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -63,7 +63,7 @@ bst_float PredValue(const SparsePage::Inst &inst,
      psum += (*trees[i])[nidx].LeafValue();
    }
  }
-  p_feats->Drop(inst);
+  p_feats->Drop();
  return psum;
 }

@@ -116,13 +116,11 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
  }
 }

-template <typename DataView>
-void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
+void FVecDrop(std::size_t const block_size, std::size_t const fvec_offset,
+              std::vector<RegTree::FVec> *p_feats) {
  for (size_t i = 0; i < block_size; ++i) {
    RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
-    const SparsePage::Inst inst = (*batch)[batch_offset + i];
-    feats.Drop(inst);
+    feats.Drop();
  }
 }

@@ -142,11 +140,15 @@ struct SparsePageView {
 struct GHistIndexMatrixView {
 private:
  GHistIndexMatrix const &page_;
-  uint64_t n_features_;
+  std::uint64_t const n_features_;
  common::Span<FeatureType const> ft_;
  common::Span<Entry> workspace_;
  std::vector<size_t> current_unroll_;

+  std::vector<std::uint32_t> const& ptrs_;
+  std::vector<float> const& mins_;
+  std::vector<float> const& values_;
+
 public:
  size_t base_rowid;

@@ -159,6 +161,9 @@ struct GHistIndexMatrixView {
        ft_{ft},
        workspace_{workplace},
        current_unroll_(n_threads > 0 ? n_threads : 1, 0),
+        ptrs_{_page.cut.Ptrs()},
+        mins_{_page.cut.MinValues()},
+        values_{_page.cut.Values()},
        base_rowid{_page.base_rowid} {}

  SparsePage::Inst operator[](size_t r) {
@@ -167,7 +172,7 @@ struct GHistIndexMatrixView {
    size_t non_missing{beg};

    for (bst_feature_t c = 0; c < n_features_; ++c) {
-      float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
+      float f = page_.GetFvalue(ptrs_, values_, mins_, r, c, common::IsCat(ft_, c));
      if (!common::CheckNAN(f)) {
        workspace_[non_missing] = Entry{c, f};
        ++non_missing;
@@ -250,10 +255,9 @@ void PredictBatchByBlockOfRowsKernel(
    FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset,
             p_thread_temp);
    // process block of rows through all trees to keep cache locality
-    PredictByAllTrees(model, tree_begin, tree_end, out_preds,
-                      batch_offset + batch.base_rowid, num_group, thread_temp,
-                      fvec_offset, block_size);
-    FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
+    PredictByAllTrees(model, tree_begin, tree_end, out_preds, batch_offset + batch.base_rowid,
+                      num_group, thread_temp, fvec_offset, block_size);
+    FVecDrop(block_size, fvec_offset, p_thread_temp);
  });
 }

@@ -470,7 +474,7 @@ class CPUPredictor : public Predictor {
          bst_node_t tid = GetLeafIndex<true, true>(tree, feats, cats);
          preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
        }
-        feats.Drop(page[i]);
+        feats.Drop();
      });
    }
  }
@@ -544,7 +548,7 @@ class CPUPredictor : public Predictor {
                  (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
            }
          }
-          feats.Drop(page[i]);
+          feats.Drop();
          // add base margin to BIAS
          if (base_margin.Size() != 0) {
            CHECK_EQ(base_margin.Shape(1), ngroup);
--- a/src/predictor/predict_fn.h
+++ b/src/predictor/predict_fn.h
@@ -18,9 +18,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
    if (has_categorical && common::IsCat(cats.split_type, nid)) {
      auto node_categories =
          cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
-      return common::Decision<true>(node_categories, fvalue, node.DefaultLeft())
-                 ? node.LeftChild()
-                 : node.RightChild();
+      return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
    } else {
      return node.LeftChild() + !(fvalue < node.SplitCond());
    }
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -248,8 +248,10 @@ class EvaluateSplitAgent {

 template <int kBlockSize>
 __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
-    bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
-    const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
+    bst_feature_t max_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs,
+    const EvaluateSplitSharedInputs shared_inputs,
+    common::Span<bst_feature_t> sorted_idx,
    const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
    common::Span<DeviceSplitCandidate> out_candidates) {
  // Aligned && shared storage for best_split
@@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
  __syncthreads();

  // Allocate blocks to one feature of one node
-  const auto input_idx = blockIdx.x / number_active_features;
+  const auto input_idx = blockIdx.x / max_active_features;
  const EvaluateSplitInputs &inputs = d_inputs[input_idx];
  // One block for each feature. Features are sampled, so fidx != blockIdx.x
-
-  int fidx = inputs.feature_set[blockIdx.x % number_active_features];
+  // Some blocks may not have any feature to work on; these simply return.
+  int feature_offset = blockIdx.x % max_active_features;
+  if (feature_offset >= inputs.feature_set.size()) {
+    return;
+  }
+  int fidx = inputs.feature_set[feature_offset];

  using AgentT = EvaluateSplitAgent<kBlockSize>;
  __shared__ typename AgentT::TempStorage temp_storage;
@@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
 }

 void GPUHistEvaluator::LaunchEvaluateSplits(
-    bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
+    bst_feature_t max_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs,
    EvaluateSplitSharedInputs shared_inputs,
    TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
    common::Span<DeviceSplitCandidate> out_splits) {
@@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
    this->SortHistogram(d_inputs, shared_inputs, evaluator);
  }

-  size_t combined_num_features = number_active_features * d_inputs.size();
-  dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
+  size_t combined_num_features = max_active_features * d_inputs.size();
+  dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
+      combined_num_features, DeviceSplitCandidate());

  // One block for each feature
  uint32_t constexpr kBlockThreads = 32;
-  dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
-      EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs,
-      shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
+  dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
+                   0}(
+      EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
+      shared_inputs,
+      this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
      evaluator, dh::ToSpan(feature_best_splits));

  // Reduce to get best candidate for left and right child over all features
-  auto reduce_offset = dh::MakeTransformIterator<size_t>(
-      thrust::make_counting_iterator(0llu),
-      [=] __device__(size_t idx) -> size_t { return idx * number_active_features; });
+  auto reduce_offset =
+      dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
+                                        [=] __device__(size_t idx) -> size_t {
+                                          return idx * max_active_features;
+                                        });
  size_t temp_storage_bytes = 0;
  auto num_segments = out_splits.size();
  cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
@@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
 }

 void GPUHistEvaluator::EvaluateSplits(
-    const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
-    common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
+    const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs,
+    EvaluateSplitSharedInputs shared_inputs,
    common::Span<GPUExpandEntry> out_entries) {
  auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();

  dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
  auto out_splits = dh::ToSpan(splits_out_storage);
-  this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator,
-                             out_splits);
+  this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
+                             evaluator, out_splits);

  auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
  auto d_entries = out_entries;
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -3,10 +3,10 @@
 */
 #ifndef EVALUATE_SPLITS_CUH_
 #define EVALUATE_SPLITS_CUH_
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 #include <xgboost/span.h>

 #include "../../common/categorical.h"
+#include "../../common/cuda_pinned_allocator.h"
 #include "../split_evaluator.h"
 #include "../updater_gpu_common.cuh"
 #include "expand_entry.cuh"
@@ -57,7 +57,7 @@ struct CatAccessor {
 class GPUHistEvaluator {
  using CatST = common::CatBitField::value_type;  // categorical storage type
  // use pinned memory to stage the categories, used for sort based splits.
-  using Alloc = thrust::system::cuda::experimental::pinned_allocator<CatST>;
+  using Alloc = xgboost::common::cuda::pinned_allocator<CatST>;

 private:
  TreeEvaluator tree_evaluator_;
@@ -170,13 +170,18 @@ class GPUHistEvaluator {
      TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);

  // impl of evaluate splits, contains CUDA kernels so it's public
-  void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs,
-                      TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
-                      common::Span<DeviceSplitCandidate> out_splits);
+  void LaunchEvaluateSplits(
+      bst_feature_t max_active_features,
+      common::Span<const EvaluateSplitInputs> d_inputs,
+      EvaluateSplitSharedInputs shared_inputs,
+      TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
+      common::Span<DeviceSplitCandidate> out_splits);
  /**
   * \brief Evaluate splits for left and right nodes.
   */
-  void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,
+  void EvaluateSplits(const std::vector<bst_node_t> &nidx,
+                      bst_feature_t max_active_features,
+                      common::Span<const EvaluateSplitInputs> d_inputs,
                      EvaluateSplitSharedInputs shared_inputs,
                      common::Span<GPUExpandEntry> out_splits);
  /**
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -389,6 +389,7 @@ class HistEvaluator {
    tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
                             tree[candidate.nid].SplitIndex(), left_weight,
                             right_weight);
+    evaluator = tree_evaluator_.GetEvaluator();

    auto max_node = std::max(left_child, tree[candidate.nid].RightChild());
    max_node = std::max(candidate.nid, max_node);
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -48,6 +48,8 @@ class TreeEvaluator {
      monotone_.HostVector().resize(n_features, 0);
      has_constraint_ = false;
    } else {
+      CHECK_LE(p.monotone_constraints.size(), n_features)
+          << "The size of monotone constraint should be less or equal to the number of features.";
      monotone_.HostVector() = p.monotone_constraints;
      monotone_.HostVector().resize(n_features, 0);
      // Initialised to some small size, can grow if needed
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -188,7 +188,8 @@ struct GPUHistMakerDevice {
  common::Span<GradientPair> gpair;

  dh::device_vector<int> monotone_constraints;
-  dh::device_vector<float> update_predictions;
+  // node idx for each sample
+  dh::device_vector<bst_node_t> positions;

  TrainParam param;

@@ -285,7 +286,7 @@ struct GPUHistMakerDevice {
        matrix.feature_segments,
        matrix.gidx_fvalue_map,
        matrix.min_fvalue,
-        matrix.is_dense
+        matrix.is_dense && !collective::IsDistributed()
    };
    auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs);
    return split;
@@ -299,12 +300,14 @@ struct GPUHistMakerDevice {
    std::vector<bst_node_t> nidx(2 * candidates.size());
    auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
-    EvaluateSplitSharedInputs shared_inputs{
-        GPUTrainingParam{param}, *quantiser, feature_types,     matrix.feature_segments,
-        matrix.gidx_fvalue_map,  matrix.min_fvalue,
-        matrix.is_dense
-    };
+    EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
+                                            matrix.feature_segments, matrix.gidx_fvalue_map,
+                                            matrix.min_fvalue,
+                                            // is_dense represents the local data
+                                            matrix.is_dense && !collective::IsDistributed()};
    dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
+    // Store the feature set ptrs so they don't go out of scope before the kernel is called
+    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
    for (size_t i = 0; i < candidates.size(); i++) {
      auto candidate = candidates.at(i);
      int left_nidx = tree[candidate.nid].LeftChild();
@@ -313,29 +316,34 @@ struct GPUHistMakerDevice {
      nidx[i * 2 + 1] = right_nidx;
      auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
      left_sampled_features->SetDevice(ctx_->gpu_id);
+      feature_sets.emplace_back(left_sampled_features);
      common::Span<bst_feature_t> left_feature_set =
          interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
      auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
      right_sampled_features->SetDevice(ctx_->gpu_id);
+      feature_sets.emplace_back(right_sampled_features);
      common::Span<bst_feature_t> right_feature_set =
-          interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
-      h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
-                              left_feature_set, hist.GetNodeHistogram(left_nidx)};
-      h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
-                                  right_feature_set, hist.GetNodeHistogram(right_nidx)};
+          interaction_constraints.Query(right_sampled_features->DeviceSpan(),
+                                        right_nidx);
+      h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
+                              candidate.split.left_sum, left_feature_set,
+                              hist.GetNodeHistogram(left_nidx)};
+      h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
+                                  candidate.split.right_sum, right_feature_set,
+                                  hist.GetNodeHistogram(right_nidx)};
    }
-    bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
+    bst_feature_t max_active_features = 0;
    for (auto input : h_node_inputs) {
-      CHECK_EQ(input.feature_set.size(), number_active_features)
-          << "Current implementation assumes that the number of active features "
-             "(after sampling) in any node is the same";
+      max_active_features = std::max(max_active_features,
+                                     bst_feature_t(input.feature_set.size()));
    }
-    dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
-                                  h_node_inputs.size() * sizeof(EvaluateSplitInputs),
-                                  cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(
+        d_node_inputs.data().get(), h_node_inputs.data(),
+        h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));

-    this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
-                                    shared_inputs, dh::ToSpan(entries));
+    this->evaluator_.EvaluateSplits(nidx, max_active_features,
+                                    dh::ToSpan(d_node_inputs), shared_inputs,
+                                    dh::ToSpan(entries));
    dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                  entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
                                  cudaMemcpyDeviceToHost));
@@ -403,8 +411,7 @@ struct GPUHistMakerDevice {
            go_left = data.split_node.DefaultLeft();
          } else {
            if (data.split_type == FeatureType::kCategorical) {
-              go_left = common::Decision<false>(data.node_cats.Bits(), cut_value,
-                                                data.split_node.DefaultLeft());
+              go_left = common::Decision(data.node_cats.Bits(), cut_value);
            } else {
              go_left = cut_value <= data.split_node.SplitCond();
            }
@@ -424,7 +431,7 @@ struct GPUHistMakerDevice {
        LOG(FATAL) << "Current objective function can not be used with external memory.";
      }
      p_out_position->Resize(0);
-      update_predictions.clear();
+      positions.clear();
      return;
    }

@@ -459,8 +466,6 @@ struct GPUHistMakerDevice {
                              HostDeviceVector<bst_node_t>* p_out_position) {
    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    auto d_gpair = this->gpair;
-    update_predictions.resize(row_partitioner->GetRows().size());
-    auto d_update_predictions = dh::ToSpan(update_predictions);
    p_out_position->SetDevice(ctx_->gpu_id);
    p_out_position->Resize(row_partitioner->GetRows().size());

@@ -481,7 +486,7 @@ struct GPUHistMakerDevice {
          if (common::IsCat(d_feature_types, position)) {
            auto node_cats = categories.subspan(categories_segments[position].beg,
                                                categories_segments[position].size);
-            go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
+            go_left = common::Decision(node_cats, element);
          } else {
            go_left = element <= node.SplitCond();
          }
@@ -495,32 +500,45 @@ struct GPUHistMakerDevice {
        node = d_nodes[position];
      }

-      d_update_predictions[row_id] = node.LeafValue();
      return position;
    };  // NOLINT

    auto d_out_position = p_out_position->DeviceSpan();
    row_partitioner->FinalisePosition(d_out_position, new_position_op);

+    auto s_position = p_out_position->ConstDeviceSpan();
+    positions.resize(s_position.size());
+    dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
+                                  s_position.size_bytes(), cudaMemcpyDeviceToDevice));
+
    dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
      bst_node_t position = d_out_position[idx];
-      d_update_predictions[idx] = d_nodes[position].LeafValue();
      bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
      d_out_position[idx] = is_row_sampled ? ~position : position;
    });
  }

  bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
-    if (update_predictions.empty()) {
+    if (positions.empty()) {
      return false;
    }
+
    CHECK(p_tree);
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
    CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
-    auto d_update_predictions = dh::ToSpan(update_predictions);
-    CHECK_EQ(out_preds_d.Size(), d_update_predictions.size());
-    dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
-      out_preds_d(idx) += d_update_predictions[idx];
+
+    auto d_position = dh::ToSpan(positions);
+    CHECK_EQ(out_preds_d.Size(), d_position.size());
+
+    auto const& h_nodes = p_tree->GetNodes();
+    dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
+    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
+                                  h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
+    auto d_nodes = dh::ToSpan(nodes);
+    dh::LaunchN(d_position.size(), [=] XGBOOST_DEVICE(std::size_t idx) mutable {
+      bst_node_t nidx = d_position[idx];
+      auto weight = d_nodes[nidx].LeafValue();
+      out_preds_d(idx) += weight;
    });
    return true;
  }
@@ -863,6 +881,7 @@ class GPUHistMaker : public TreeUpdater {
  std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker;  // NOLINT

  char const* Name() const override { return "grow_gpu_hist"; }
+  bool HasNodePosition() const override { return true; }

 private:
  bool initialised_{false};
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -78,7 +78,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(

  {
    GradientPairPrecise grad_stat;
-    if (p_fmat->IsDense()) {
+    if (p_fmat->IsDense() && !collective::IsDistributed()) {
      /**
       * Specialized code for dense data: For dense data (with no missing value), the sum
       * of gradient histogram is equal to snode[nid]
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -89,7 +89,7 @@ class TreeRefresher : public TreeUpdater {
                     dmlc::BeginPtr(stemp[tid]) + offset);
            offset += tree->param.num_nodes;
          }
-          feats.Drop(inst);
+          feats.Drop();
        });
      }
      // aggregate the statistics
--- a/tests/buildkite/build-containers.sh
+++ b/tests/buildkite/build-containers.sh
@@ -23,10 +23,15 @@ case "${container}" in
  gpu|rmm)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    if [[ $container == "rmm" ]]
+    then
+      BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
+    fi
    ;;

  gpu_build_centos7|jvm_gpu_build)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
    ;;

  *)
--- a/tests/buildkite/build-cuda-with-rmm.sh
+++ b/tests/buildkite/build-cuda-with-rmm.sh
@@ -15,7 +15,8 @@ fi

 command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+                `"NCCL_VERSION_ARG=$NCCL_VERSION"

 echo "--- Build libxgboost from the source"
 $command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -16,7 +16,8 @@ else
 fi

 command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
-                `"CUDA_VERSION_ARG=$CUDA_VERSION"
+                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
+                `"NCCL_VERSION_ARG=$NCCL_VERSION"

 echo "--- Build libxgboost from the source"
 $command_wrapper tests/ci_build/prune_libnccl.sh
--- a/tests/buildkite/build-jvm-packages-gpu.sh
+++ b/tests/buildkite/build-jvm-packages-gpu.sh
@@ -14,5 +14,7 @@ else
 fi

 tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \
-  --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_jvm_packages.sh \
+  --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
+  --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
+  tests/ci_build/build_jvm_packages.sh \
  ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag}
--- a/tests/buildkite/build-win64-gpu.ps1
+++ b/tests/buildkite/build-win64-gpu.ps1
@@ -12,10 +12,10 @@ if ( $is_release_branch -eq 0 ) {
 }
 mkdir build
 cd build
-cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-  -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_UNITY_BUILD=ON ${arch_flag}
+cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
+  -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag}
 $msbuild = -join @(
-  "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\MSBuild\\15.0"
+  "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\MSBuild\\Current"
  "\\Bin\\MSBuild.exe"
 )
 & $msbuild xgboost.sln /m /p:Configuration=Release /nodeReuse:false
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -22,9 +22,10 @@ function set_buildkite_env_vars_in_container {

 set -x

-CUDA_VERSION=11.0.3
-RAPIDS_VERSION=22.10
-SPARK_VERSION=3.0.1
+CUDA_VERSION=11.8.0
+NCCL_VERSION=2.16.5-1
+RAPIDS_VERSION=23.02
+SPARK_VERSION=3.1.1
 JDK_VERSION=8

 if [[ -z ${BUILDKITE:-} ]]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jiaming Yuan	36eb41c960	Bump version to 1.7.6 (#9305 )	2023-06-16 03:33:16 +08:00
Jiaming Yuan	39ddf40a8d	[backport] Optimize prediction with QuantileDMatrix. (#9096 ) (#9303 )	2023-06-15 23:32:03 +08:00
Jiaming Yuan	573f1c7db4	[backport] Fix monotone constraints on CPU. (#9122 ) (#9287 ) * [backport] Fix monotone constraints on CPU. (#9122)	2023-06-11 17:51:25 +08:00
Jiaming Yuan	abc80d2a6d	[backport] Improve doxygen (#8959 ) (#9284 ) * Remove Sphinx build from GH Action * Build Doxygen as part of RTD build * Add jQuery Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>	2023-06-11 13:22:23 +08:00
Jiaming Yuan	e882fb3262	[backport] [spark] Make spark model have the same UID with its estimator (#9022 ) (#9285 ) Signed-off-by: Weichen Xu <weichen.xu@databricks.com> Co-authored-by: WeichenXu <weichen.xu@databricks.com>	2023-06-11 13:18:23 +08:00
Jiaming Yuan	3218f6cd3c	[backport] Disable dense opt for distributed training. (#9272 ) (#9288 )	2023-06-11 11:08:45 +08:00
Jiaming Yuan	a962611de7	Disable SHAP test on 1.7 (#9290 )	2023-06-11 02:13:36 +08:00
Jiaming Yuan	14476e8868	[backport] Fix tests with pandas 2.0. (#9014 ) (#9289 ) * Fix tests with pandas 2.0. - `is_categorical` is replaced by `is_categorical_dtype`. - one hot encoding returns boolean type instead of integer type.	2023-06-11 00:52:44 +08:00
Jiaming Yuan	03f3879b71	[backport] [doc] fix the cudf installation [skip ci] (#9106 ) (#9286 ) Co-authored-by: Bobby Wang <wbo4958@gmail.com>	2023-06-10 04:09:27 +08:00
Jiaming Yuan	21d95f3d8f	[backport] [doc][R] Update link. (#8998 ) (#9001 )	2023-03-30 20:02:31 +08:00
Jiaming Yuan	5cd4015d70	[backport] Fill column size. (#8997 )	2023-03-30 15:21:42 +08:00
Jiaming Yuan	b8c6b86792	Bump version to 1.7.5. (#8994 )	2023-03-29 21:41:10 +08:00
Jiaming Yuan	1baebe231b	[backport] [CI] Fix Windows wheel to be compatible with Poetry (#8991 ) (#8992 ) * [CI] Fix Windows wheel to be compatible with Poetry Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>	2023-03-29 14:26:20 +08:00
Jiaming Yuan	365da0b8f4	[backport] [doc] Add missing document for pyspark ranker. (#8692 ) (#8990 )	2023-03-29 12:02:51 +08:00
Jiaming Yuan	f5f03dfb61	[backport] Update dmlc-core to get C++17 deprecation warning (#8855 ) (#8982 ) Co-authored-by: Rong Ou <rong.ou@gmail.com>	2023-03-27 21:31:30 +08:00
Jiaming Yuan	a1c209182d	[backport] Update c++ requirement to 17 for the R package. (#8860 ) (#8983 )	2023-03-27 18:24:25 +08:00
Jiaming Yuan	4be75d852c	[backport] Fix scope of feature set pointers (#8850 ) (#8972 ) --------- Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>	2023-03-27 00:33:08 +08:00
Jiaming Yuan	ba50e6eb62	[backport] [CI] Require C++17 + CMake 3.18; Use CUDA 11.8 in CI (#8853 ) (#8971 ) Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>	2023-03-26 00:10:03 +08:00
Jiaming Yuan	36ad160501	Bump version to 1.7.4. (#8805 )	2023-02-16 06:40:01 +08:00
Jiaming Yuan	c22f6db4bf	[backport] Fix CPU bin compression with categorical data. (#8809 ) (#8810 ) * [backport] Fix CPU bin compression with categorical data. (#8809) * Fix CPU bin compression with categorical data. * The bug causes the maximum category to be lesser than 256 or the maximum number of bins when the input data is dense. * Avoid test symbol.	2023-02-16 06:39:25 +08:00
Jiaming Yuan	f15a6d2b19	[backport] Fix ranking with quantile dmatrix and group weight. (#8762 ) (#8800 ) * [backport] Fix ranking with quantile dmatrix and group weight. (#8762) * backport test utilities.	2023-02-15 02:45:09 +08:00
Jiaming Yuan	08a547f5c2	[backport] Fix feature types param (#8772 ) (#8801 ) Signed-off-by: Weichen Xu <weichen.xu@databricks.com> Co-authored-by: WeichenXu <weichen.xu@databricks.com>	2023-02-15 01:39:20 +08:00
Jiaming Yuan	60303db2ee	[backport] Fix GPU L1 error. (#8749 ) (#8770 ) * [backport] Fix GPU L1 error. (#8749) * Fix backport.	2023-02-09 20:16:39 +08:00
Jiaming Yuan	df984f9c43	[backport] Fix different number of features in gpu_hist evaluator. (#8754 ) (#8769 ) Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>	2023-02-09 18:31:49 +08:00
Jiaming Yuan	2f22f8d49b	[backport] Make sure input numpy array is aligned. (#8690 ) (#8696 ) (#8734 ) * [backport] Make sure input numpy array is aligned. (#8690) - use `np.require` to specify that the alignment is required. - scipy csr as well. - validate input pointer in `ArrayInterface`. * Workaround CUDA warning. (#8696) * backport from half type support for alignment. * fix import.	2023-02-06 16:58:15 +08:00
Jiaming Yuan	68d86336d7	[backport] [R] fix OpenMP detection on macOS (#8684 ) (#8732 ) Co-authored-by: James Lamb <jaylamb20@gmail.com>	2023-01-29 12:43:10 +08:00
Jiaming Yuan	76bdca072a	[R] Fix threads used to create DMatrix in predict. (#8681 ) (#8682 )	2023-01-15 04:00:31 +08:00
Jiaming Yuan	021e6a842a	[backport] [R] Get CXX flags from R CMD config. (#8669 ) (#8680 )	2023-01-14 18:46:59 +08:00
Jiaming Yuan	e5bef4ffce	[backport] Fix threads in DMatrix slice. (#8667 ) (#8679 )	2023-01-14 18:46:04 +08:00
Jiaming Yuan	10bb0a74ef	[backport] [CI] Skip pyspark sparse tests. (#8675 ) (#8678 )	2023-01-14 06:40:17 +08:00
Jiaming Yuan	e803d06d8c	[backport] [R] Remove unused assert definition. (#8526 ) (#8668 )	2023-01-13 04:55:29 +08:00
Jiaming Yuan	ccf43d4ba0	Bump R package version to 1.7.3. (#8649 )	2023-01-06 20:34:05 +08:00
Jiaming Yuan	dd58c2ac47	Bump version to 1.7.3. (#8646 )	2023-01-06 17:55:51 +08:00
Jiaming Yuan	899e4c8988	[backport] Do not return internal value for `get_params`. (#8634 ) (#8642 )	2023-01-06 02:28:39 +08:00
Jiaming Yuan	a2085bf223	[backport] Fix loading GPU pickle with a CPU-only xgboost distribution. (#8632 ) (#8641 ) We can handle loading the pickle on a CPU-only machine if the XGBoost is built with CUDA enabled (Linux and Windows PyPI package), but not if the distribution is CPU-only (macOS PyPI package).	2023-01-06 02:28:21 +08:00
Jiaming Yuan	067b704e58	[backport] Fix inference with categorical feature. (#8591 ) (#8602 ) (#8638 ) * Fix inference with categorical feature. (#8591) * Fix windows build on buildkite. (#8602) * workaround.	2023-01-06 01:17:49 +08:00
Jiaming Yuan	1a834b2b85	Fix linalg iterator. (#8603 ) (#8639 )	2023-01-05 23:16:10 +08:00
Jiaming Yuan	162b48a1a4	[backport] [CI] Disable gtest with RMM (#8620 ) (#8640 ) Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>	2023-01-05 23:13:45 +08:00
Jiaming Yuan	83a078b7e5	[backport] Fix sklearn test that calls a removed field (#8579 ) (#8636 ) Co-authored-by: Rong Ou <rong.ou@gmail.com>	2023-01-05 21:17:05 +08:00
Jiaming Yuan	575fba651b	[backport] [CI] Fix CI with updated dependencies. (#8631 ) (#8635 )	2023-01-05 19:10:58 +08:00
Jiaming Yuan	62ed8b5fef	Bump release version to 1.7.2. (#8569 )	2022-12-08 21:46:26 +08:00
Jiaming Yuan	a980e10744	Properly await async method client.wait_for_workers (#8558 ) (#8567 ) * Properly await async method client.wait_for_workers * ignore mypy error. Co-authored-by: jiamingy <jm.yuan@outlook.com> Co-authored-by: Matthew Rocklin <mrocklin@gmail.com>	2022-12-07 23:25:05 +08:00
Jiaming Yuan	59c54e361b	[pyspark] Make QDM optional based on cuDF check (#8471 ) (#8556 ) Co-authored-by: WeichenXu <weichen.xu@databricks.com>	2022-12-07 03:19:35 +08:00
Jiaming Yuan	60a8c8ebba	[pyspark] sort qid for SparkRanker (#8497 ) (#8555 ) * [pyspark] sort qid for SparkRandker * resolve comments Co-authored-by: Bobby Wang <wbo4958@gmail.com>	2022-12-07 02:07:37 +08:00
Jiaming Yuan	58bc225657	[backport] [CI] Fix github action mismatched glibcxx. (#8551 ) (#8552 ) Split up the Linux test to use the toolchain from conda forge.	2022-12-06 21:35:26 +08:00
Jiaming Yuan	850b53100f	[backport] [doc] Fix outdated document [skip ci] (#8527 ) (#8553 ) * [doc] Fix document around categorical parameters. [skip ci] * note on validate parameter [skip ci] * Fix dask doc as well [skip ci]	2022-12-06 18:21:14 +08:00
Philip Hyunsu Cho	67b657dad0	SO_DOMAIN do not support on IBM i, using getsockname instead (#8437 ) (#8500 )	2022-11-30 11:47:59 -08:00
Philip Hyunsu Cho	db14e3feb7	Support null value in CUDA array interface. (#8486 ) (#8499 )	2022-11-30 11:44:54 -08:00
Robert Maynard	9372370dda	Work with newer thrust and libcudacxx (#8432 ) * Thrust 1.17 removes the experimental/pinned_allocator. When xgboost is brought into a large project it can be compiled against Thrust 1.17+ which don't offer this experimental allocator. To ensure that going forward xgboost works in all environments we provide a xgboost namespaced version of the pinned_allocator that previously was in Thrust. * Update gputreeshap to work with libcudacxx 1.9	2022-11-11 01:15:25 +08:00
Jiaming Yuan	1136a7e0c3	Fix CRAN note on cleanup. (#8447 )	2022-11-09 14:22:54 +08:00
Jiaming Yuan	a347cd512b	[backport] [R] Fix CRAN test notes. (#8428 ) (#8440 ) - Limit the number of used CPU cores in examples. - Add a note for the constraint. - Bring back the cleanup script.	2022-11-09 07:12:46 +08:00
Jiaming Yuan	9ff0c0832a	Fix 1.7.1 version file. (#8427 )	2022-11-06 03:19:54 +08:00
@@ -1 +1 @@
-1.7.1
+1.7.6