Compare commits
169 Commits
v1.3.2
...
release_1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
522b8977c2 | ||
|
|
8147d78b12 | ||
|
|
651c4ac03b | ||
|
|
1fb75d2460 | ||
|
|
6609211517 | ||
|
|
b78ad1e623 | ||
|
|
96f8843694 | ||
|
|
a6d1fbf8d1 | ||
|
|
a347ef7c66 | ||
|
|
d05c47dcf8 | ||
|
|
9f5e2c52ce | ||
|
|
c42387162d | ||
|
|
1220024442 | ||
|
|
964ee6b605 | ||
|
|
04fedefd4d | ||
|
|
f814d4027a | ||
|
|
2cc37370e2 | ||
|
|
c6a0bdbb5a | ||
|
|
357a78b3de | ||
|
|
d231e7c35f | ||
|
|
604ae01b7a | ||
|
|
43f52ed33c | ||
|
|
a59c7323b4 | ||
|
|
5c87c2bba8 | ||
|
|
8825670c9c | ||
|
|
744c46995c | ||
|
|
a7083d3c13 | ||
|
|
1d90577800 | ||
|
|
794fd6a46b | ||
|
|
bcc0277338 | ||
|
|
4ee8340e79 | ||
|
|
f6fe15d11f | ||
|
|
23b4165a6b | ||
|
|
c2b6b80600 | ||
|
|
642336add7 | ||
|
|
4230dcb614 | ||
|
|
e2d8a99413 | ||
|
|
4e00737c60 | ||
|
|
4f75f514ce | ||
|
|
1a73a28511 | ||
|
|
325bc93e16 | ||
|
|
19a2c54265 | ||
|
|
366f3cb9d8 | ||
|
|
e4894111ba | ||
|
|
49c22c23b4 | ||
|
|
5ae7f9944b | ||
|
|
f20074e826 | ||
|
|
a9b4a95225 | ||
|
|
9c8523432a | ||
|
|
1fa6793a4e | ||
|
|
9da2287ab8 | ||
|
|
b6167cd2ff | ||
|
|
9b530e5697 | ||
|
|
c375173dca | ||
|
|
17913713b5 | ||
|
|
872e559b91 | ||
|
|
25077564ab | ||
|
|
bdedaab8d1 | ||
|
|
9f15b9e322 | ||
|
|
dc97b5f19f | ||
|
|
4c5d2608e0 | ||
|
|
9a0399e898 | ||
|
|
9b267a435e | ||
|
|
e8c5c53e2f | ||
|
|
dbf7e9d3cb | ||
|
|
1335db6113 | ||
|
|
5d48d40d9a | ||
|
|
218a5fb6dd | ||
|
|
4656b09d5d | ||
|
|
dbb5208a0a | ||
|
|
1e949110da | ||
|
|
a4101de678 | ||
|
|
72892cc80d | ||
|
|
9d62b14591 | ||
|
|
411592a347 | ||
|
|
87ab1ad607 | ||
|
|
a9ec0ea6da | ||
|
|
d8ec7aad5a | ||
|
|
c3c8e66fc9 | ||
|
|
0f2ed21a9d | ||
|
|
d167892c7e | ||
|
|
0ad6e18a2a | ||
|
|
55ee2bd77f | ||
|
|
1b70a323a7 | ||
|
|
bc08e0c9d1 | ||
|
|
8968ca7c0a | ||
|
|
d19a0ddacf | ||
|
|
740d042255 | ||
|
|
4bf23c2391 | ||
|
|
8942c98054 | ||
|
|
561809200a | ||
|
|
fec66d033a | ||
|
|
a275f40267 | ||
|
|
7bc56fa0ed | ||
|
|
26982f9fce | ||
|
|
f0fd7629ae | ||
|
|
9d2832a3a3 | ||
|
|
f8bb678c67 | ||
|
|
d6d72de339 | ||
|
|
d132933550 | ||
|
|
d356b7a071 | ||
|
|
89a00a5866 | ||
|
|
0027220aa0 | ||
|
|
7f4d3a91b9 | ||
|
|
03cd087da1 | ||
|
|
c709f2aaaf | ||
|
|
f2f7dd87b8 | ||
|
|
78f2cd83d7 | ||
|
|
80065d571e | ||
|
|
96d3d32265 | ||
|
|
7c9dcbedbc | ||
|
|
f5ff90cd87 | ||
|
|
8747885a8b | ||
|
|
b2246ae7ef | ||
|
|
60cfd14349 | ||
|
|
516a93d25c | ||
|
|
195a41cef1 | ||
|
|
2b049b32e9 | ||
|
|
fa13992264 | ||
|
|
5e9e525223 | ||
|
|
8ad22bf4e7 | ||
|
|
de8fd852a5 | ||
|
|
610ee632cc | ||
|
|
cb207a355d | ||
|
|
2231940d1d | ||
|
|
95cbfad990 | ||
|
|
fbb980d9d3 | ||
|
|
cd0821500c | ||
|
|
380f6f4ab8 | ||
|
|
ca3da55de4 | ||
|
|
125b3c0f2d | ||
|
|
ad1a527709 | ||
|
|
bf6cfe3b99 | ||
|
|
d8d684538c | ||
|
|
c5876277a8 | ||
|
|
0e97d97d50 | ||
|
|
749364f25d | ||
|
|
347f593169 | ||
|
|
ef4a0e0aac | ||
|
|
886486a519 | ||
|
|
5c8ccf4455 | ||
|
|
3c3f026ec1 | ||
|
|
d45c0d843b | ||
|
|
1e2c3ade9e | ||
|
|
8139849ab6 | ||
|
|
9a194273cd | ||
|
|
aac4eba2ef | ||
|
|
afc4567268 | ||
|
|
a30461cf87 | ||
|
|
c31e3efa7c | ||
|
|
0d483cb7c1 | ||
|
|
b8044e6136 | ||
|
|
0ffaf0f5be | ||
|
|
47b86180f6 | ||
|
|
55bdf084cb | ||
|
|
703c2d06aa | ||
|
|
d6386e45e8 | ||
|
|
05e5563c2c | ||
|
|
84b726ef53 | ||
|
|
c103ec51d8 | ||
|
|
4f70e14031 | ||
|
|
fb56da5e8b | ||
|
|
c2ba4fb957 | ||
|
|
927c316aeb | ||
|
|
f4ff1c53fd | ||
|
|
b0036b339b | ||
|
|
956beead70 | ||
|
|
4dbbeb635d | ||
|
|
0c85b90671 |
232
.github/workflows/main.yml
vendored
232
.github/workflows/main.yml
vendored
@@ -6,9 +6,6 @@ name: XGBoost-CI
|
||||
# events but only for the master branch
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
|
||||
|
||||
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
|
||||
jobs:
|
||||
gtest-cpu:
|
||||
@@ -34,7 +31,10 @@ jobs:
|
||||
- name: Run gtest binary
|
||||
run: |
|
||||
cd build
|
||||
ctest --extra-verbose
|
||||
# libomp internal error:
|
||||
# OMP: Error #131: Thread identifier invalid.
|
||||
./testxgboost --gtest_filter="-HistIndexCreationWithExternalMemory.Test"
|
||||
ctest -R TestXGBoostCLI --extra-verbose
|
||||
|
||||
gtest-cpu-nonomp:
|
||||
name: Test Google C++ unittest (CPU Non-OMP)
|
||||
@@ -62,6 +62,45 @@ jobs:
|
||||
cd build
|
||||
ctest --extra-verbose
|
||||
|
||||
python-sdist-test:
|
||||
name: Test installing XGBoost Python source package
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-10.15, windows-latest]
|
||||
python-version: ["3.8"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
- name: Install osx system dependencies
|
||||
if: matrix.os == 'macos-10.15'
|
||||
run: |
|
||||
brew install ninja libomp
|
||||
- name: Install Ubuntu system dependencies
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt-get install -y --no-install-recommends ninja-build
|
||||
- uses: conda-incubator/setup-miniconda@v2
|
||||
with:
|
||||
auto-update-conda: true
|
||||
python-version: ${{ matrix.python-version }}
|
||||
activate-environment: test
|
||||
- name: Display Conda env
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
conda info
|
||||
conda list
|
||||
- name: Build and install XGBoost
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
cd python-package
|
||||
python --version
|
||||
python setup.py sdist
|
||||
pip install -v ./dist/xgboost-*.tar.gz
|
||||
cd ..
|
||||
python -c 'import xgboost'
|
||||
|
||||
c-api-demo:
|
||||
name: Test installing XGBoost lib + building the C API demo
|
||||
runs-on: ${{ matrix.os }}
|
||||
@@ -81,6 +120,7 @@ jobs:
|
||||
with:
|
||||
auto-update-conda: true
|
||||
python-version: ${{ matrix.python-version }}
|
||||
activate-environment: test
|
||||
- name: Display Conda env
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
@@ -117,10 +157,20 @@ jobs:
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.8'
|
||||
architecture: 'x64'
|
||||
|
||||
- uses: actions/setup-java@v1
|
||||
with:
|
||||
java-version: 1.8
|
||||
|
||||
- name: Install Python packages
|
||||
run: |
|
||||
python -m pip install wheel setuptools
|
||||
python -m pip install awscli
|
||||
|
||||
- name: Cache Maven packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
@@ -133,6 +183,28 @@ jobs:
|
||||
cd jvm-packages
|
||||
mvn test -B -pl :xgboost4j_2.12
|
||||
|
||||
- name: Extract branch name
|
||||
shell: bash
|
||||
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
|
||||
id: extract_branch
|
||||
if: |
|
||||
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
|
||||
matrix.os == 'windows-latest'
|
||||
|
||||
- name: Publish artifact xgboost4j.dll to S3
|
||||
run: |
|
||||
cd lib/
|
||||
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
|
||||
dir
|
||||
python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
|
||||
if: |
|
||||
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
|
||||
matrix.os == 'windows-latest'
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
|
||||
|
||||
|
||||
- name: Test XGBoost4J-Spark
|
||||
run: |
|
||||
rm -rfv build/
|
||||
@@ -161,6 +233,24 @@ jobs:
|
||||
run: |
|
||||
make lint
|
||||
|
||||
mypy:
|
||||
runs-on: ubuntu-latest
|
||||
name: Type checking for Python
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.7'
|
||||
architecture: 'x64'
|
||||
- name: Install Python packages
|
||||
run: |
|
||||
python -m pip install wheel setuptools mypy dask[complete] distributed
|
||||
- name: Run mypy
|
||||
run: |
|
||||
make mypy
|
||||
|
||||
doxygen:
|
||||
runs-on: ubuntu-latest
|
||||
name: Generate C/C++ API doc using Doxygen
|
||||
@@ -192,7 +282,7 @@ jobs:
|
||||
run: |
|
||||
cd build/
|
||||
tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
|
||||
python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/ --acl public-read
|
||||
python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
|
||||
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
|
||||
@@ -207,7 +297,7 @@ jobs:
|
||||
submodules: 'true'
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.7'
|
||||
python-version: '3.8'
|
||||
architecture: 'x64'
|
||||
- name: Install system packages
|
||||
run: |
|
||||
@@ -224,133 +314,3 @@ jobs:
|
||||
make -C doc html
|
||||
env:
|
||||
SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
|
||||
|
||||
lintr:
|
||||
runs-on: ${{ matrix.config.os }}
|
||||
name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
env:
|
||||
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
|
||||
RSPM: ${{ matrix.config.rspm }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- name: Cache R packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.R_LIBS_USER }}
|
||||
key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }}
|
||||
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2-
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- name: Run lintr
|
||||
run: |
|
||||
cd R-package
|
||||
R.exe CMD INSTALL .
|
||||
Rscript.exe tests/helper_scripts/run_lint.R
|
||||
|
||||
test-with-R:
|
||||
runs-on: ${{ matrix.config.os }}
|
||||
name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'}
|
||||
env:
|
||||
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
|
||||
RSPM: ${{ matrix.config.rspm }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- name: Cache R packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.R_LIBS_USER }}
|
||||
key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }}
|
||||
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2-
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.7'
|
||||
architecture: 'x64'
|
||||
|
||||
- name: Test R
|
||||
run: |
|
||||
python tests/ci_build/test_r_package.py --compiler="${{ matrix.config.compiler }}" --build-tool="${{ matrix.config.build }}"
|
||||
|
||||
test-R-CRAN:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- {r: 'release'}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- uses: r-lib/actions/setup-tinytex@master
|
||||
|
||||
- name: Cache R packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.R_LIBS_USER }}
|
||||
key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }}
|
||||
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2-
|
||||
|
||||
- name: Install system packages
|
||||
run: |
|
||||
sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- name: Check R Package
|
||||
run: |
|
||||
# Print stacktrace upon success of failure
|
||||
make Rcheck || tests/ci_build/print_r_stacktrace.sh fail
|
||||
tests/ci_build/print_r_stacktrace.sh success
|
||||
|
||||
116
.github/workflows/r_tests.yml
vendored
Normal file
116
.github/workflows/r_tests.yml
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
name: XGBoost-R-Tests
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
|
||||
|
||||
jobs:
|
||||
lintr:
|
||||
runs-on: ${{ matrix.config.os }}
|
||||
name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
env:
|
||||
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
|
||||
RSPM: ${{ matrix.config.rspm }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- name: Run lintr
|
||||
run: |
|
||||
cd R-package
|
||||
R.exe CMD INSTALL .
|
||||
Rscript.exe tests/helper_scripts/run_lint.R
|
||||
|
||||
test-with-R:
|
||||
runs-on: ${{ matrix.config.os }}
|
||||
name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'}
|
||||
env:
|
||||
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
|
||||
RSPM: ${{ matrix.config.rspm }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.7'
|
||||
architecture: 'x64'
|
||||
|
||||
- name: Test R
|
||||
run: |
|
||||
python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool='${{ matrix.config.build }}'
|
||||
|
||||
test-R-CRAN:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- {r: 'release'}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- uses: r-lib/actions/setup-tinytex@master
|
||||
|
||||
- name: Install system packages
|
||||
run: |
|
||||
sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- name: Check R Package
|
||||
run: |
|
||||
# Print stacktrace upon success of failure
|
||||
make Rcheck || tests/ci_build/print_r_stacktrace.sh fail
|
||||
tests/ci_build/print_r_stacktrace.sh success
|
||||
10
.gitignore
vendored
10
.gitignore
vendored
@@ -115,3 +115,13 @@ dask-worker-space/
|
||||
|
||||
# Jupyter notebook checkpoints
|
||||
.ipynb_checkpoints/
|
||||
|
||||
# credentials and key material
|
||||
config
|
||||
credentials
|
||||
credentials.csv
|
||||
*.env
|
||||
*.pem
|
||||
*.pub
|
||||
*.rdp
|
||||
*_rsa
|
||||
|
||||
1
.gitmodules
vendored
1
.gitmodules
vendored
@@ -1,6 +1,7 @@
|
||||
[submodule "dmlc-core"]
|
||||
path = dmlc-core
|
||||
url = https://github.com/dmlc/dmlc-core
|
||||
branch = main
|
||||
[submodule "cub"]
|
||||
path = cub
|
||||
url = https://github.com/NVlabs/cub
|
||||
|
||||
31
.travis.yml
31
.travis.yml
@@ -9,49 +9,24 @@ env:
|
||||
|
||||
jobs:
|
||||
include:
|
||||
- os: linux
|
||||
arch: amd64
|
||||
env: TASK=python_sdist_test
|
||||
- os: linux
|
||||
arch: arm64
|
||||
env: TASK=python_sdist_test
|
||||
- os: linux
|
||||
arch: arm64
|
||||
env: TASK=python_test
|
||||
services:
|
||||
- docker
|
||||
- os: osx
|
||||
arch: amd64
|
||||
osx_image: xcode10.2
|
||||
env: TASK=python_test
|
||||
- os: osx
|
||||
arch: amd64
|
||||
osx_image: xcode10.2
|
||||
env: TASK=python_sdist_test
|
||||
- os: osx
|
||||
arch: amd64
|
||||
osx_image: xcode10.2
|
||||
env: TASK=java_test
|
||||
- os: linux
|
||||
arch: s390x
|
||||
env: TASK=s390x_test
|
||||
|
||||
# dependent brew packages
|
||||
# the dependencies from homebrew is installed manually from setup script due to outdated image from travis.
|
||||
addons:
|
||||
homebrew:
|
||||
packages:
|
||||
- cmake
|
||||
- libomp
|
||||
- graphviz
|
||||
- openssl
|
||||
- libgit2
|
||||
- lz4
|
||||
- wget
|
||||
- r
|
||||
update: true
|
||||
update: false
|
||||
apt:
|
||||
packages:
|
||||
- snapd
|
||||
- unzip
|
||||
|
||||
before_install:
|
||||
- source tests/travis/travis_setup_env.sh
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
cmake_minimum_required(VERSION 3.13)
|
||||
project(xgboost LANGUAGES CXX C VERSION 1.3.0)
|
||||
project(xgboost LANGUAGES CXX C VERSION 1.4.2)
|
||||
include(cmake/Utils.cmake)
|
||||
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
|
||||
cmake_policy(SET CMP0022 NEW)
|
||||
@@ -203,7 +203,7 @@ endif (HIDE_CXX_SYMBOLS)
|
||||
|
||||
target_include_directories(xgboost
|
||||
INTERFACE
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>
|
||||
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include>)
|
||||
|
||||
# This creates its own shared library `xgboost4j'.
|
||||
|
||||
77
Jenkinsfile
vendored
77
Jenkinsfile
vendored
@@ -56,6 +56,7 @@ pipeline {
|
||||
parallel ([
|
||||
'clang-tidy': { ClangTidy() },
|
||||
'build-cpu': { BuildCPU() },
|
||||
'build-cpu-arm64': { BuildCPUARM64() },
|
||||
'build-cpu-rabit-mock': { BuildCPUMock() },
|
||||
// Build reference, distribution-ready Python wheel with CUDA 10.0
|
||||
// using CentOS 6 image
|
||||
@@ -64,6 +65,7 @@ pipeline {
|
||||
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
|
||||
'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) },
|
||||
'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') },
|
||||
'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '10.0') },
|
||||
'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') },
|
||||
'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') },
|
||||
'build-jvm-doc': { BuildJVMDoc() }
|
||||
@@ -77,6 +79,7 @@ pipeline {
|
||||
script {
|
||||
parallel ([
|
||||
'test-python-cpu': { TestPythonCPU() },
|
||||
'test-python-cpu-arm64': { TestPythonCPUARM64() },
|
||||
// artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env
|
||||
'test-python-gpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', test_rmm: true) },
|
||||
'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') },
|
||||
@@ -164,6 +167,35 @@ def BuildCPU() {
|
||||
}
|
||||
}
|
||||
|
||||
def BuildCPUARM64() {
|
||||
node('linux && arm64') {
|
||||
unstash name: 'srcs'
|
||||
echo "Build CPU ARM64"
|
||||
def container_type = "aarch64"
|
||||
def docker_binary = "docker"
|
||||
def wheel_tag = "manylinux2014_aarch64"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOL=ON
|
||||
${dockerRun} ${container_type} ${docker_binary} bash -c "cd build && ctest --extra-verbose"
|
||||
${dockerRun} ${container_type} ${docker_binary} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
|
||||
${dockerRun} ${container_type} ${docker_binary} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} ${wheel_tag}
|
||||
${dockerRun} ${container_type} ${docker_binary} bash -c "auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl && python tests/ci_build/rename_whl.py wheelhouse/*.whl ${commit_id} ${wheel_tag}"
|
||||
mv -v wheelhouse/*.whl python-package/dist/
|
||||
# Make sure that libgomp.so is vendored in the wheel
|
||||
${dockerRun} ${container_type} ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
|
||||
"""
|
||||
echo 'Stashing Python wheel...'
|
||||
stash name: "xgboost_whl_arm64_cpu", includes: 'python-package/dist/*.whl'
|
||||
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
|
||||
echo 'Uploading Python wheel...'
|
||||
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
|
||||
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
|
||||
}
|
||||
stash name: 'xgboost_cli_arm64', includes: 'xgboost'
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def BuildCPUMock() {
|
||||
node('linux && cpu') {
|
||||
unstash name: 'srcs'
|
||||
@@ -190,11 +222,21 @@ def BuildCUDA(args) {
|
||||
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
|
||||
arch_flag = "-DGPU_COMPUTE_VER=75"
|
||||
}
|
||||
def wheel_tag = "manylinux2010_x86_64"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOLS=ON ${arch_flag}
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} ${wheel_tag}
|
||||
"""
|
||||
if (args.cuda_version == ref_cuda_ver) {
|
||||
sh """
|
||||
${dockerRun} auditwheel_x86_64 ${docker_binary} auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py wheelhouse/*.whl ${commit_id} ${wheel_tag}
|
||||
mv -v wheelhouse/*.whl python-package/dist/
|
||||
# Make sure that libgomp.so is vendored in the wheel
|
||||
${dockerRun} auditwheel_x86_64 ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
|
||||
"""
|
||||
}
|
||||
echo 'Stashing Python wheel...'
|
||||
stash name: "xgboost_whl_cuda${args.cuda_version}", includes: 'python-package/dist/*.whl'
|
||||
if (args.cuda_version == ref_cuda_ver && (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release'))) {
|
||||
@@ -224,6 +266,24 @@ def BuildCUDA(args) {
|
||||
}
|
||||
}
|
||||
|
||||
def BuildRPackageWithCUDA(args) {
|
||||
node('linux && cpu_build') {
|
||||
unstash name: 'srcs'
|
||||
def container_type = 'gpu_build_r_centos6'
|
||||
def docker_binary = "docker"
|
||||
def docker_args = "--build-arg CUDA_VERSION_ARG=10.0"
|
||||
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_r_pkg_with_cuda.sh ${commit_id}
|
||||
"""
|
||||
echo 'Uploading R tarball...'
|
||||
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
|
||||
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', includePathPattern:'xgboost_r_gpu_linux_*.tar.gz'
|
||||
}
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def BuildJVMPackagesWithCUDA(args) {
|
||||
node('linux && mgpu') {
|
||||
unstash name: 'srcs'
|
||||
@@ -295,6 +355,21 @@ def TestPythonCPU() {
|
||||
}
|
||||
}
|
||||
|
||||
def TestPythonCPUARM64() {
|
||||
node('linux && arm64') {
|
||||
unstash name: "xgboost_whl_arm64_cpu"
|
||||
unstash name: 'srcs'
|
||||
unstash name: 'xgboost_cli_arm64'
|
||||
echo "Test Python CPU ARM64"
|
||||
def container_type = "aarch64"
|
||||
def docker_binary = "docker"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu-arm64
|
||||
"""
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def TestPythonGPU(args) {
|
||||
def nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
|
||||
def artifact_cuda_version = (args.artifact_cuda_version) ?: ref_cuda_ver
|
||||
|
||||
9
Makefile
9
Makefile
@@ -86,6 +86,15 @@ cover: check
|
||||
)
|
||||
endif
|
||||
|
||||
|
||||
# dask is required to pass, others are not
|
||||
# If any of the dask tests failed, contributor won't see the other error.
|
||||
mypy:
|
||||
cd python-package; \
|
||||
mypy ./xgboost/dask.py ../tests/python/test_with_dask.py --follow-imports=silent; \
|
||||
mypy ../tests/python-gpu/test_gpu_with_dask.py --follow-imports=silent; \
|
||||
mypy . || true ;
|
||||
|
||||
clean:
|
||||
$(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o #xgboost
|
||||
$(RM) -rf build_tests *.gcov tests/cpp/xgboost_test
|
||||
|
||||
252
NEWS.md
252
NEWS.md
@@ -3,6 +3,258 @@ XGBoost Change Log
|
||||
|
||||
This file records the changes in xgboost library in reverse chronological order.
|
||||
|
||||
## v1.3.0 (2020.12.08)
|
||||
|
||||
### XGBoost4J-Spark: Exceptions should cancel jobs gracefully instead of killing SparkContext (#6019).
|
||||
* By default, exceptions in XGBoost4J-Spark causes the whole SparkContext to shut down, necessitating the restart of the Spark cluster. This behavior is often a major inconvenience.
|
||||
* Starting from 1.3.0 release, XGBoost adds a new parameter `killSparkContextOnWorkerFailure` to optionally prevent killing SparkContext. If this parameter is set, exceptions will gracefully cancel training jobs instead of killing SparkContext.
|
||||
|
||||
### GPUTreeSHAP: GPU acceleration of the TreeSHAP algorithm (#6038, #6064, #6087, #6099, #6163, #6281, #6332)
|
||||
* [SHAP (SHapley Additive exPlanations)](https://github.com/slundberg/shap) is a game theoretic approach to explain predictions of machine learning models. It computes feature importance scores for individual examples, establishing how each feature influences a particular prediction. TreeSHAP is an optimized SHAP algorithm specifically designed for decision tree ensembles.
|
||||
* Starting with 1.3.0 release, it is now possible to leverage CUDA-capable GPUs to accelerate the TreeSHAP algorithm. Check out [the demo notebook](https://github.com/dmlc/xgboost/blob/master/demo/gpu_acceleration/shap.ipynb).
|
||||
* The CUDA implementation of the TreeSHAP algorithm is hosted at [rapidsai/GPUTreeSHAP](https://github.com/rapidsai/gputreeshap). XGBoost imports it as a Git submodule.
|
||||
|
||||
### New style Python callback API (#6199, #6270, #6320, #6348, #6376, #6399, #6441)
|
||||
* The XGBoost Python package now offers a re-designed callback API. The new callback API lets you design various extensions of training in idiomatic Python. In addition, the new callback API allows you to use early stopping with the native Dask API (`xgboost.dask`). Check out [the tutorial](https://xgboost.readthedocs.io/en/release_1.3.0/python/callbacks.html) and [the demo](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/callbacks.py).
|
||||
|
||||
### Enable the use of `DeviceQuantileDMatrix` / `DaskDeviceQuantileDMatrix` with large data (#6201, #6229, #6234).
|
||||
* `DeviceQuantileDMatrix` can achieve memory saving by avoiding extra copies of the training data, and the saving is bigger for large data. Unfortunately, large data with more than 2^31 elements was triggering integer overflow bugs in CUB and Thrust. Tracking issue: #6228.
|
||||
* This release contains a series of work-arounds to allow the use of `DeviceQuantileDMatrix` with large data:
|
||||
- Loop over `copy_if` (#6201)
|
||||
- Loop over `thrust::reduce` (#6229)
|
||||
- Implement the inclusive scan algorithm in-house, to handle large offsets (#6234)
|
||||
|
||||
### Support slicing of tree models (#6302)
|
||||
* Accessing the best iteration of a model after the application of early stopping used to be error-prone, requiring users to manually pass the `ntree_limit` argument to the `predict()` function.
|
||||
* Now we provide a simple interface to slice tree models by specifying a range of boosting rounds. The tree ensemble can be split into multiple sub-ensembles via the slicing interface. Check out [an example](https://xgboost.readthedocs.io/en/release_1.3.0/python/model.html).
|
||||
* In addition, the early stopping callback now supports `save_best` option. When enabled, XGBoost will save (persist) the model at the best boosting round and discard the trees that were fit subsequent to the best round.
|
||||
|
||||
### Weighted subsampling of features (columns) (#5962)
|
||||
* It is now possible to sample features (columns) via weighted subsampling, in which features with higher weights are more likely to be selected in the sample. Weighted subsampling allows you to encode domain knowledge by emphasizing a particular set of features in the choice of tree splits. In addition, you can prevent particular features from being used in any splits, by assigning them zero weights.
|
||||
* Check out [the demo](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/feature_weights.py).
|
||||
|
||||
### Improved integration with Dask
|
||||
* Support reverse-proxy environment such as Google Kubernetes Engine (#6343, #6475)
|
||||
* An XGBoost training job will no longer use all available workers. Instead, it will only use the workers that contain input data (#6343).
|
||||
* The new callback API works well with the Dask training API.
|
||||
* The `predict()` and `fit()` function of `DaskXGBClassifier` and `DaskXGBRegressor` now accept a base margin (#6155).
|
||||
* Support more meta data in the Dask API (#6130, #6132, #6333).
|
||||
* Allow passing extra keyword arguments as `kwargs` in `predict()` (#6117)
|
||||
* Fix typo in dask interface: `sample_weights` -> `sample_weight` (#6240)
|
||||
* Allow empty data matrix in AFT survival, as Dask may produce empty partitions (#6379)
|
||||
* Speed up prediction by overlapping prediction jobs in all workers (#6412)
|
||||
|
||||
### Experimental support for direct splits with categorical features (#6028, #6128, #6137, #6140, #6164, #6165, #6166, #6179, #6194, #6219)
|
||||
* Currently, XGBoost requires users to one-hot-encode categorical variables. This has adverse performance implications, as the creation of many dummy variables results into higher memory consumption and may require fitting deeper trees to achieve equivalent model accuracy.
|
||||
* The 1.3.0 release of XGBoost contains an experimental support for direct handling of categorical variables in test nodes. Each test node will have the condition of form `feature_value \in match_set`, where the `match_set` on the right hand side contains one or more matching categories. The matching categories in `match_set` represent the condition for traversing to the right child node. Currently, XGBoost will only generate categorical splits with only a single matching category ("one-vs-rest split"). In a future release, we plan to remove this restriction and produce splits with multiple matching categories in `match_set`.
|
||||
* The categorical split requires the use of JSON model serialization. The legacy binary serialization method cannot be used to save (persist) models with categorical splits.
|
||||
* Note. This feature is currently highly experimental. Use it at your own risk. See the detailed list of limitations at [#5949](https://github.com/dmlc/xgboost/pull/5949).
|
||||
|
||||
### Experimental plugin for RAPIDS Memory Manager (#5873, #6131, #6146, #6150, #6182)
|
||||
* RAPIDS Memory Manager library ([rapidsai/rmm](https://github.com/rapidsai/rmm)) provides a collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin. With this plugin, XGBoost is now able to share a common GPU memory pool with other applications using RMM, such as the RAPIDS data science packages.
|
||||
* See [the demo](https://github.com/dmlc/xgboost/blob/master/demo/rmm_plugin/README.md) for a working example, as well as directions for building XGBoost with the RMM plugin.
|
||||
* The plugin will be soon considered non-experimental, once #6297 is resolved.
|
||||
|
||||
### Experimental plugin for oneAPI programming model (#5825)
|
||||
* oneAPI is a programming interface developed by Intel aimed at providing one programming model for many types of hardware such as CPU, GPU, FPGA and other hardware accelerators.
|
||||
* XGBoost now includes an experimental plugin for using oneAPI for the predictor and objective functions. The plugin is hosted in the directory `plugin/updater_oneapi`.
|
||||
* Roadmap: #5442
|
||||
|
||||
### Pickling the XGBoost model will now trigger JSON serialization (#6027)
|
||||
* The pickle will now contain the JSON string representation of the XGBoost model, as well as related configuration.
|
||||
|
||||
### Performance improvements
|
||||
* Various performance improvement on multi-core CPUs
|
||||
- Optimize DMatrix build time by up to 3.7x. (#5877)
|
||||
- CPU predict performance improvement, by up to 3.6x. (#6127)
|
||||
- Optimize CPU sketch allreduce for sparse data (#6009)
|
||||
- Thread local memory allocation for BuildHist, leading to speedup up to 1.7x. (#6358)
|
||||
- Disable hyperthreading for DMatrix creation (#6386). This speeds up DMatrix creation by up to 2x.
|
||||
- Simple fix for static schedule in predict (#6357)
|
||||
* Unify thread configuration, to make it easy to utilize all CPU cores (#6186)
|
||||
* [jvm-packages] Clean the way deterministic partitioning is computed (#6033)
|
||||
* Speed up JSON serialization by implementing an intrusive pointer class (#6129). It leads to 1.5x-2x performance boost.
|
||||
|
||||
### API additions
|
||||
* [R] Add SHAP summary plot using ggplot2 (#5882)
|
||||
* Modin DataFrame can now be used as input (#6055)
|
||||
* [jvm-packages] Add `getNumFeature` method (#6075)
|
||||
* Add MAPE metric (#6119)
|
||||
* Implement GPU predict leaf. (#6187)
|
||||
* Enable cuDF/cuPy inputs in `XGBClassifier` (#6269)
|
||||
* Document tree method for feature weights. (#6312)
|
||||
* Add `fail_on_invalid_gpu_id` parameter, which will cause XGBoost to terminate upon seeing an invalid value of `gpu_id` (#6342)
|
||||
|
||||
### Breaking: the default evaluation metric for classification is changed to `logloss` / `mlogloss` (#6183)
|
||||
* The default metric used to be accuracy, and it is not statistically consistent to perform early stopping with the accuracy metric when we are really optimizing the log loss for the `binary:logistic` objective.
|
||||
* For statistical consistency, the default metric for classification has been changed to `logloss`. Users may choose to preserve the old behavior by explicitly specifying `eval_metric`.
|
||||
|
||||
### Breaking: `skmaker` is now removed (#5971)
|
||||
* The `skmaker` updater has not been documented nor tested.
|
||||
|
||||
### Breaking: the JSON model format no longer stores the leaf child count (#6094).
|
||||
* The leaf child count field has been deprecated and is not used anywhere in the XGBoost codebase.
|
||||
|
||||
### Breaking: XGBoost now requires MacOS 10.14 (Mojave) and later.
|
||||
* Homebrew has dropped support for MacOS 10.13 (High Sierra), so we are not able to install the OpenMP runtime (`libomp`) from Homebrew on MacOS 10.13. Please use MacOS 10.14 (Mojave) or later.
|
||||
|
||||
### Deprecation notices
|
||||
* The use of `LabelEncoder` in `XGBClassifier` is now deprecated and will be removed in the next minor release (#6269). The deprecation is necessary to support multiple types of inputs, such as cuDF data frames or cuPy arrays.
|
||||
* The use of certain positional arguments in the Python interface is deprecated (#6365). Users will see deprecation warnings for the use of positional arguments for certain function parameters. New code should use keyword arguments as much as possible. We have not yet decided when we will fully require the use of keyword arguments.
|
||||
|
||||
### Bug-fixes
|
||||
* On big-endian arch, swap the byte order in the binary serializer to enable loading models that were produced by a little-endian machine (#5813).
|
||||
* [jvm-packages] Fix deterministic partitioning with dataset containing Double.NaN (#5996)
|
||||
* Limit tree depth for GPU hist to 31 to prevent integer overflow (#6045)
|
||||
* [jvm-packages] Set `maxBins` to 256 to align with the default value in the C++ code (#6066)
|
||||
* [R] Fix CRAN check (#6077)
|
||||
* Add back support for `scipy.sparse.coo_matrix` (#6162)
|
||||
* Handle duplicated values in sketching. (#6178)
|
||||
* Catch all standard exceptions in C API. (#6220)
|
||||
* Fix linear GPU input (#6255)
|
||||
* Fix inplace prediction interval. (#6259)
|
||||
* [R] allow `xgb.plot.importance()` calls to fill a grid (#6294)
|
||||
* Lazy import dask libraries. (#6309)
|
||||
* Deterministic data partitioning for external memory (#6317)
|
||||
* Avoid resetting seed for every configuration. (#6349)
|
||||
* Fix label errors in graph visualization (#6369)
|
||||
* [jvm-packages] fix potential unit test suites aborted issue due to race condition (#6373)
|
||||
* [R] Fix warnings from `R check --as-cran` (#6374)
|
||||
* [R] Fix a crash that occurs with noLD R (#6378)
|
||||
* [R] Do not convert continuous labels to factors (#6380)
|
||||
* [R] remove uses of `exists()` (#6387)
|
||||
* Propagate parameters to the underlying `Booster` handle from `XGBClassifier.set_param` / `XGBRegressor.set_param`. (#6416)
|
||||
* [R] Fix R package installation via CMake (#6423)
|
||||
* Enforce row-major order in cuPy array (#6459)
|
||||
* Fix filtering callable objects in the parameters passed to the scikit-learn API. (#6466)
|
||||
|
||||
### Maintenance: Testing, continuous integration, build system
|
||||
* [CI] Improve JVM test in GitHub Actions (#5930)
|
||||
* Refactor plotting test so that it can run independently (#6040)
|
||||
* [CI] Cancel builds on subsequent pushes (#6011)
|
||||
* Fix Dask Pytest fixture (#6024)
|
||||
* [CI] Migrate linters to GitHub Actions (#6035)
|
||||
* [CI] Remove win2016 JVM test from GitHub Actions (#6042)
|
||||
* Fix CMake build with `BUILD_STATIC_LIB` option (#6090)
|
||||
* Don't link imported target in CMake (#6093)
|
||||
* Work around a compiler bug in MacOS AppleClang 11 (#6103)
|
||||
* [CI] Fix CTest by running it in a correct directory (#6104)
|
||||
* [R] Check warnings explicitly for model compatibility tests (#6114)
|
||||
* [jvm-packages] add xgboost4j-gpu/xgboost4j-spark-gpu module to facilitate release (#6136)
|
||||
* [CI] Time GPU tests. (#6141)
|
||||
* [R] remove warning in configure.ac (#6152)
|
||||
* [CI] Upgrade cuDF and RMM to 0.16 nightlies; upgrade to Ubuntu 18.04 (#6157)
|
||||
* [CI] Test C API demo (#6159)
|
||||
* Option for generating device debug info. (#6168)
|
||||
* Update `.gitignore` (#6175, #6193, #6346)
|
||||
* Hide C++ symbols from dmlc-core (#6188)
|
||||
* [CI] Added arm64 job in Travis-CI (#6200)
|
||||
* [CI] Fix Docker build for CUDA 11 (#6202)
|
||||
* [CI] Move non-OpenMP gtest to GitHub Actions (#6210)
|
||||
* [jvm-packages] Fix up build for xgboost4j-gpu, xgboost4j-spark-gpu (#6216)
|
||||
* Add more tests for categorical data support (#6219)
|
||||
* [dask] Test for data initialization. (#6226)
|
||||
* Bump junit from 4.11 to 4.13.1 in /jvm-packages/xgboost4j (#6230)
|
||||
* Bump junit from 4.11 to 4.13.1 in /jvm-packages/xgboost4j-gpu (#6233)
|
||||
* [CI] Reduce testing load with RMM (#6249)
|
||||
* [CI] Build a Python wheel for aarch64 platform (#6253)
|
||||
* [CI] Time the CPU tests on Jenkins. (#6257)
|
||||
* [CI] Skip Dask tests on ARM. (#6267)
|
||||
* Fix a typo in `is_arm()` in testing.py (#6271)
|
||||
* [CI] replace `egrep` with `grep -E` (#6287)
|
||||
* Support unity build. (#6295)
|
||||
* [CI] Mark flaky tests as XFAIL (#6299)
|
||||
* [CI] Use separate Docker cache for each CUDA version (#6305)
|
||||
* Added `USE_NCCL_LIB_PATH` option to enable user to set `NCCL_LIBRARY` during build (#6310)
|
||||
* Fix flaky data initialization test. (#6318)
|
||||
* Add a badge for GitHub Actions (#6321)
|
||||
* Optional `find_package` for sanitizers. (#6329)
|
||||
* Use pytest conventions consistently in Python tests (#6337)
|
||||
* Fix missing space in warning message (#6340)
|
||||
* Update `custom_metric_obj.rst` (#6367)
|
||||
* [CI] Run R check with `--as-cran` flag on GitHub Actions (#6371)
|
||||
* [CI] Remove R check from Jenkins (#6372)
|
||||
* Mark GPU external memory test as XFAIL. (#6381)
|
||||
* [CI] Add noLD R test (#6382)
|
||||
* Fix MPI build. (#6403)
|
||||
* [CI] Upgrade to MacOS Mojave image (#6406)
|
||||
* Fix flaky sparse page dmatrix test. (#6417)
|
||||
* [CI] Upgrade cuDF and RMM to 0.17 nightlies (#6434)
|
||||
* [CI] Fix CentOS 6 Docker images (#6467)
|
||||
* [CI] Vendor libgomp in the manylinux Python wheel (#6461)
|
||||
* [CI] Hot fix for libgomp vendoring (#6482)
|
||||
|
||||
### Maintenance: Clean up and merge the Rabit submodule (#6023, #6095, #6096, #6105, #6110, #6262, #6275, #6290)
|
||||
* The Rabit submodule is now maintained as part of the XGBoost codebase.
|
||||
* Tests for Rabit are now part of the test suites of XGBoost.
|
||||
* Rabit can now be built on the Windows platform.
|
||||
* We made various code re-formatting for the C++ code with clang-tidy.
|
||||
* Public headers of XGBoost no longer depend on Rabit headers.
|
||||
* Unused CMake targets for Rabit were removed.
|
||||
* Single-point model recovery has been dropped and removed from Rabit, simplifying the Rabit code greatly. The single-point model recovery feature has not been adequately maintained over the years.
|
||||
* We removed the parts of Rabit that were not useful for XGBoost.
|
||||
|
||||
### Maintenance: Refactor code for legibility and maintainability
|
||||
* Unify CPU hist sketching (#5880)
|
||||
* [R] fix uses of 1:length(x) and other small things (#5992)
|
||||
* Unify evaluation functions. (#6037)
|
||||
* Make binary bin search reusable. (#6058)
|
||||
* Unify set index data. (#6062)
|
||||
* [R] Remove `stringi` dependency (#6109)
|
||||
* Merge extract cuts into QuantileContainer. (#6125)
|
||||
* Reduce C++ compiler warnings (#6197, #6198, #6213, #6286, #6325)
|
||||
* Cleanup Python code. (#6223)
|
||||
* Small cleanup to evaluator. (#6400)
|
||||
|
||||
### Usability Improvements, Documentation
|
||||
* [jvm-packages] add example to handle missing value other than 0 (#5677)
|
||||
* Add DMatrix usage examples to the C API demo (#5854)
|
||||
* List `DaskDeviceQuantileDMatrix` in the doc. (#5975)
|
||||
* Update Python custom objective demo. (#5981)
|
||||
* Update the JSON model schema to document more objective functions. (#5982)
|
||||
* [Python] Fix warning when `missing` field is not used. (#5969)
|
||||
* Fix typo in tracker logging (#5994)
|
||||
* Move a warning about empty dataset, so that it's shown for all objectives and metrics (#5998)
|
||||
* Fix the instructions for installing the nightly build. (#6004)
|
||||
* [Doc] Add dtreeviz as a showcase example of integration with 3rd-party software (#6013)
|
||||
* [jvm-packages] [doc] Update install doc for JVM packages (#6051)
|
||||
* Fix typo in `xgboost.callback.early_stop` docstring (#6071)
|
||||
* Add cache suffix to the files used in the external memory demo. (#6088)
|
||||
* [Doc] Document the parameter `kill_spark_context_on_worker_failure` (#6097)
|
||||
* Fix link to the demo for custom objectives (#6100)
|
||||
* Update Dask doc. (#6108)
|
||||
* Validate weights are positive values. (#6115)
|
||||
* Document the updated CMake version requirement. (#6123)
|
||||
* Add demo for `DaskDeviceQuantileDMatrix`. (#6156)
|
||||
* Cosmetic fixes in `faq.rst` (#6161)
|
||||
* Fix error message. (#6176)
|
||||
* [Doc] Add list of winning solutions in data science competitions using XGBoost (#6177)
|
||||
* Fix a comment in demo to use correct reference (#6190)
|
||||
* Update the list of winning solutions using XGBoost (#6192)
|
||||
* Consistent style for build status badge (#6203)
|
||||
* [Doc] Add info on GPU compiler (#6204)
|
||||
* Update the list of winning solutions (#6222, #6254)
|
||||
* Add link to XGBoost's Twitter handle (#6244)
|
||||
* Fix minor typos in XGBClassifier methods' docstrings (#6247)
|
||||
* Add sponsors link to FUNDING.yml (#6252)
|
||||
* Group CLI demo into subdirectory. (#6258)
|
||||
* Reduce warning messages from `gbtree`. (#6273)
|
||||
* Create a tutorial for using the C API in a C/C++ application (#6285)
|
||||
* Update plugin instructions for CMake build (#6289)
|
||||
* [doc] make Dask distributed example copy-pastable (#6345)
|
||||
* [Python] Add option to use `libxgboost.so` from the system path (#6362)
|
||||
* Fixed few grammatical mistakes in doc (#6393)
|
||||
* Fix broken link in CLI doc (#6396)
|
||||
* Improve documentation for the Dask API (#6413)
|
||||
* Revise misleading exception information: no such param of `allow_non_zero_missing` (#6418)
|
||||
* Fix CLI ranking demo. (#6439)
|
||||
* Fix broken links. (#6455)
|
||||
|
||||
### Acknowledgement
|
||||
**Contributors**: Nan Zhu (@CodingCat), @FelixYBW, Jack Dunn (@JackDunnNZ), Jean Lescut-Muller (@JeanLescut), Boris Feld (@Lothiraldan), Nikhil Choudhary (@Nikhil1O1), Rory Mitchell (@RAMitchell), @ShvetsKS, Anthony D'Amato (@Totoketchup), @Wittty-Panda, neko (@akiyamaneko), Alexander Gugel (@alexanderGugel), @dependabot[bot], DIVYA CHAUHAN (@divya661), Daniel Steinberg (@dstein64), Akira Funahashi (@funasoul), Philip Hyunsu Cho (@hcho3), Tong He (@hetong007), Hristo Iliev (@hiliev), Honza Sterba (@honzasterba), @hzy001, Igor Moura (@igormp), @jameskrach, James Lamb (@jameslamb), Naveed Ahmed Saleem Janvekar (@janvekarnaveed), Kyle Nicholson (@kylejn27), lacrosse91 (@lacrosse91), Christian Lorentzen (@lorentzenchr), Manikya Bardhan (@manikyabard), @nabokovas, John Quitto-Graham (@nvidia-johnq), @odidev, Qi Zhang (@qzhang90), Sergio Gavilán (@sgavil), Tanuja Kirthi Doddapaneni (@tanuja3), Cuong Duong (@tcuongd), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), vcarpani (@vcarpani), Vladislav Epifanov (@vepifanov), Vitalie Spinu (@vspinu), Bobby Wang (@wbo4958), Zeno Gantner (@zenogantner), zhang_jf (@zuston)
|
||||
|
||||
**Reviewers**: Nan Zhu (@CodingCat), John Zedlewski (@JohnZed), Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Anthony D'Amato (@Totoketchup), @Wittty-Panda, Alexander Gugel (@alexanderGugel), Codecov Comments Bot (@codecov-commenter), Codecov (@codecov-io), DIVYA CHAUHAN (@divya661), Devin Robison (@drobison00), Geoffrey Blake (@geoffreyblake), Mark Harris (@harrism), Philip Hyunsu Cho (@hcho3), Honza Sterba (@honzasterba), Igor Moura (@igormp), @jakirkham, @jameskrach, James Lamb (@jameslamb), Janakarajan Natarajan (@janaknat), Jake Hemstad (@jrhemstad), Keith Kraus (@kkraus14), Kyle Nicholson (@kylejn27), Christian Lorentzen (@lorentzenchr), Michael Mayer (@mayer79), Nikolay Petrov (@napetrov), @odidev, PSEUDOTENSOR / Jonathan McKinney (@pseudotensor), Qi Zhang (@qzhang90), Sergio Gavilán (@sgavil), Scott Lundberg (@slundberg), Cuong Duong (@tcuongd), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), vcarpani (@vcarpani), Vladislav Epifanov (@vepifanov), Vincent Nijs (@vnijs), Vitalie Spinu (@vspinu), Bobby Wang (@wbo4958), William Hicks (@wphicks)
|
||||
|
||||
## v1.2.0 (2020.08.22)
|
||||
|
||||
### XGBoost4J-Spark now supports the GPU algorithm (#5171)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
Package: xgboost
|
||||
Type: Package
|
||||
Title: Extreme Gradient Boosting
|
||||
Version: 1.3.0.1
|
||||
Version: 1.4.2.1
|
||||
Date: 2020-08-28
|
||||
Authors@R: c(
|
||||
person("Tianqi", "Chen", role = c("aut"),
|
||||
@@ -53,7 +53,6 @@ Suggests:
|
||||
testthat,
|
||||
lintr,
|
||||
igraph (>= 1.0.1),
|
||||
jsonlite,
|
||||
float,
|
||||
crayon,
|
||||
titanic
|
||||
@@ -64,5 +63,6 @@ Imports:
|
||||
methods,
|
||||
data.table (>= 1.9.6),
|
||||
magrittr (>= 1.5),
|
||||
jsonlite (>= 1.0),
|
||||
RoxygenNote: 7.1.1
|
||||
SystemRequirements: GNU make, C++14
|
||||
|
||||
@@ -36,6 +36,7 @@ export(xgb.create.features)
|
||||
export(xgb.cv)
|
||||
export(xgb.dump)
|
||||
export(xgb.gblinear.history)
|
||||
export(xgb.get.config)
|
||||
export(xgb.ggplot.deepness)
|
||||
export(xgb.ggplot.importance)
|
||||
export(xgb.ggplot.shap.summary)
|
||||
@@ -52,6 +53,7 @@ export(xgb.plot.tree)
|
||||
export(xgb.save)
|
||||
export(xgb.save.raw)
|
||||
export(xgb.serialize)
|
||||
export(xgb.set.config)
|
||||
export(xgb.train)
|
||||
export(xgb.unserialize)
|
||||
export(xgboost)
|
||||
@@ -78,6 +80,8 @@ importFrom(graphics,lines)
|
||||
importFrom(graphics,par)
|
||||
importFrom(graphics,points)
|
||||
importFrom(graphics,title)
|
||||
importFrom(jsonlite,fromJSON)
|
||||
importFrom(jsonlite,toJSON)
|
||||
importFrom(magrittr,"%>%")
|
||||
importFrom(stats,median)
|
||||
importFrom(stats,predict)
|
||||
|
||||
@@ -11,6 +11,7 @@ xgb.Booster.handle <- function(params = list(), cachelist = list(),
|
||||
if (typeof(modelfile) == "character") {
|
||||
## A filename
|
||||
handle <- .Call(XGBoosterCreate_R, cachelist)
|
||||
modelfile <- path.expand(modelfile)
|
||||
.Call(XGBoosterLoadModel_R, handle, modelfile[1])
|
||||
class(handle) <- "xgb.Booster.handle"
|
||||
if (length(params) > 0) {
|
||||
|
||||
@@ -15,8 +15,7 @@
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
|
||||
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
|
||||
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
|
||||
@@ -27,6 +26,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, ...)
|
||||
if (length(data) > 1)
|
||||
stop("'data' has class 'character' and length ", length(data),
|
||||
".\n 'data' accepts either a numeric matrix or a single filename.")
|
||||
data <- path.expand(data)
|
||||
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
||||
} else if (is.matrix(data)) {
|
||||
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing)
|
||||
@@ -65,6 +65,7 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
|
||||
warning("xgboost: label will be ignored.")
|
||||
}
|
||||
if (is.character(data)) {
|
||||
data <- path.expand(data)
|
||||
dtrain <- xgb.DMatrix(data[1])
|
||||
} else if (inherits(data, "xgb.DMatrix")) {
|
||||
dtrain <- data
|
||||
@@ -171,8 +172,7 @@ dimnames.xgb.DMatrix <- function(x) {
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#'
|
||||
#' labels <- getinfo(dtrain, 'label')
|
||||
#' setinfo(dtrain, 'label', 1-labels)
|
||||
@@ -224,8 +224,7 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#'
|
||||
#' labels <- getinfo(dtrain, 'label')
|
||||
#' setinfo(dtrain, 'label', 1-labels)
|
||||
@@ -290,8 +289,7 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#'
|
||||
#' dsub <- slice(dtrain, 1:42)
|
||||
#' labels1 <- getinfo(dsub, 'label')
|
||||
@@ -347,8 +345,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#'
|
||||
#' dtrain
|
||||
#' print(dtrain, verbose=TRUE)
|
||||
|
||||
@@ -7,8 +7,7 @@
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
|
||||
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
|
||||
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
|
||||
@@ -19,6 +18,7 @@ xgb.DMatrix.save <- function(dmatrix, fname) {
|
||||
if (!inherits(dmatrix, "xgb.DMatrix"))
|
||||
stop("dmatrix must be xgb.DMatrix")
|
||||
|
||||
fname <- path.expand(fname)
|
||||
.Call(XGDMatrixSaveBinary_R, dmatrix, fname[1], 0L)
|
||||
return(TRUE)
|
||||
}
|
||||
|
||||
38
R-package/R/xgb.config.R
Normal file
38
R-package/R/xgb.config.R
Normal file
@@ -0,0 +1,38 @@
|
||||
#' Global configuration consists of a collection of parameters that can be applied in the global
|
||||
#' scope. See \url{https://xgboost.readthedocs.io/en/stable/parameter.html} for the full list of
|
||||
#' parameters supported in the global configuration. Use \code{xgb.set.config} to update the
|
||||
#' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
|
||||
#' values of all global-scope parameters (listed in
|
||||
#' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
|
||||
#'
|
||||
#' @rdname xgbConfig
|
||||
#' @title Set and get global configuration
|
||||
#' @name xgb.set.config, xgb.get.config
|
||||
#' @export xgb.set.config xgb.get.config
|
||||
#' @param ... List of parameters to be set, as keyword arguments
|
||||
#' @return
|
||||
#' \code{xgb.set.config} returns \code{TRUE} to signal success. \code{xgb.get.config} returns
|
||||
#' a list containing all global-scope parameters and their values.
|
||||
#'
|
||||
#' @examples
|
||||
#' # Set verbosity level to silent (0)
|
||||
#' xgb.set.config(verbosity = 0)
|
||||
#' # Now global verbosity level is 0
|
||||
#' config <- xgb.get.config()
|
||||
#' print(config$verbosity)
|
||||
#' # Set verbosity level to warning (1)
|
||||
#' xgb.set.config(verbosity = 1)
|
||||
#' # Now global verbosity level is 1
|
||||
#' config <- xgb.get.config()
|
||||
#' print(config$verbosity)
|
||||
xgb.set.config <- function(...) {
  # Gather the keyword arguments into a named list and hand them to the
  # XGBoost C API, which expects the configuration as a JSON string.
  params <- list(...)
  .Call(XGBSetGlobalConfig_R, jsonlite::toJSON(params, auto_unbox = TRUE))
  # Return TRUE to signal success, per the documented contract.
  return(TRUE)
}
|
||||
|
||||
#' @rdname xgbConfig
|
||||
xgb.get.config <- function() {
  # Fetch the current global configuration from the C API as a JSON string
  # and decode it into an R list of parameter name/value pairs.
  json_str <- .Call(XGBGetGlobalConfig_R)
  return(jsonlite::fromJSON(json_str))
}
|
||||
@@ -48,8 +48,8 @@
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' data(agaricus.test, package='xgboost')
|
||||
#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
|
||||
#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label))
|
||||
#'
|
||||
#' param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
#' nrounds = 4
|
||||
|
||||
@@ -112,7 +112,7 @@
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#' cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
|
||||
#' max_depth = 3, eta = 1, objective = "binary:logistic")
|
||||
#' print(cv)
|
||||
|
||||
@@ -66,6 +66,7 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
|
||||
if (is.null(fname)) {
|
||||
return(model_dump)
|
||||
} else {
|
||||
fname <- path.expand(fname)
|
||||
writeLines(model_dump, fname[1])
|
||||
return(TRUE)
|
||||
}
|
||||
|
||||
@@ -42,6 +42,7 @@ xgb.save <- function(model, fname) {
|
||||
if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
|
||||
}
|
||||
model <- xgb.Booster.complete(model, saveraw = FALSE)
|
||||
fname <- path.expand(fname)
|
||||
.Call(XGBoosterSaveModel_R, model$handle, fname[1])
|
||||
return(TRUE)
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
#'
|
||||
#' 2. Booster Parameters
|
||||
#'
|
||||
#' 2.1. Parameter for Tree Booster
|
||||
#' 2.1. Parameters for Tree Booster
|
||||
#'
|
||||
#' \itemize{
|
||||
#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
|
||||
@@ -24,12 +24,14 @@
|
||||
#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
|
||||
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1
|
||||
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
|
||||
#' \item \code{lambda} L2 regularization term on weights. Default: 1
|
||||
#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
|
||||
#' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
|
||||
#' \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.
|
||||
#' \item \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions. Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. Feature index values should start from \code{0} (\code{0} references the first column). Leave argument unspecified for no interaction constraints.
|
||||
#' }
|
||||
#'
|
||||
#' 2.2. Parameter for Linear Booster
|
||||
#' 2.2. Parameters for Linear Booster
|
||||
#'
|
||||
#' \itemize{
|
||||
#' \item \code{lambda} L2 regularization term on weights. Default: 0
|
||||
@@ -193,8 +195,8 @@
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' data(agaricus.test, package='xgboost')
|
||||
#'
|
||||
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
#' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
|
||||
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label))
|
||||
#' watchlist <- list(train = dtrain, eval = dtest)
|
||||
#'
|
||||
#' ## A simple xgb.train example:
|
||||
|
||||
@@ -91,6 +91,8 @@ NULL
|
||||
#' @importFrom data.table setkeyv
|
||||
#' @importFrom data.table setnames
|
||||
#' @importFrom magrittr %>%
|
||||
#' @importFrom jsonlite fromJSON
|
||||
#' @importFrom jsonlite toJSON
|
||||
#' @importFrom utils object.size str tail
|
||||
#' @importFrom stats predict
|
||||
#' @importFrom stats median
|
||||
|
||||
39
R-package/man/xgbConfig.Rd
Normal file
39
R-package/man/xgbConfig.Rd
Normal file
@@ -0,0 +1,39 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.config.R
|
||||
\name{xgb.set.config, xgb.get.config}
|
||||
\alias{xgb.set.config, xgb.get.config}
|
||||
\alias{xgb.set.config}
|
||||
\alias{xgb.get.config}
|
||||
\title{Set and get global configuration}
|
||||
\usage{
|
||||
xgb.set.config(...)
|
||||
|
||||
xgb.get.config()
|
||||
}
|
||||
\arguments{
|
||||
\item{...}{List of parameters to be set, as keyword arguments}
|
||||
}
|
||||
\value{
|
||||
\code{xgb.set.config} returns \code{TRUE} to signal success. \code{xgb.get.config} returns
|
||||
a list containing all global-scope parameters and their values.
|
||||
}
|
||||
\description{
|
||||
Global configuration consists of a collection of parameters that can be applied in the global
|
||||
scope. See \url{https://xgboost.readthedocs.io/en/stable/parameter.html} for the full list of
|
||||
parameters supported in the global configuration. Use \code{xgb.set.config} to update the
|
||||
values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
|
||||
values of all global-scope parameters (listed in
|
||||
\url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
|
||||
}
|
||||
\examples{
|
||||
# Set verbosity level to silent (0)
|
||||
xgb.set.config(verbosity = 0)
|
||||
# Now global verbosity level is 0
|
||||
config <- xgb.get.config()
|
||||
print(config$verbosity)
|
||||
# Set verbosity level to warning (1)
|
||||
xgb.set.config(verbosity = 1)
|
||||
# Now global verbosity level is 1
|
||||
config <- xgb.get.config()
|
||||
print(config$verbosity)
|
||||
}
|
||||
@@ -43,6 +43,8 @@ extern SEXP XGDMatrixNumRow_R(SEXP);
|
||||
extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
|
||||
extern SEXP XGBSetGlobalConfig_R(SEXP);
|
||||
extern SEXP XGBGetGlobalConfig_R();
|
||||
|
||||
static const R_CallMethodDef CallEntries[] = {
|
||||
{"XGBoosterBoostOneIter_R", (DL_FUNC) &XGBoosterBoostOneIter_R, 4},
|
||||
@@ -73,6 +75,8 @@ static const R_CallMethodDef CallEntries[] = {
|
||||
{"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3},
|
||||
{"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3},
|
||||
{"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2},
|
||||
{"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1},
|
||||
{"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0},
|
||||
{NULL, NULL, 0}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// Copyright (c) 2014 by Contributors
|
||||
#include <dmlc/logging.h>
|
||||
#include <dmlc/omp.h>
|
||||
#include <dmlc/common.h>
|
||||
#include <xgboost/c_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
@@ -49,6 +50,21 @@ void _DMatrixFinalizer(SEXP ext) {
|
||||
R_API_END();
|
||||
}
|
||||
|
||||
SEXP XGBSetGlobalConfig_R(SEXP json_str) {
|
||||
R_API_BEGIN();
|
||||
CHECK_CALL(XGBSetGlobalConfig(CHAR(asChar(json_str))));
|
||||
R_API_END();
|
||||
return R_NilValue;
|
||||
}
|
||||
|
||||
SEXP XGBGetGlobalConfig_R() {
|
||||
const char* json_str;
|
||||
R_API_BEGIN();
|
||||
CHECK_CALL(XGBGetGlobalConfig(&json_str));
|
||||
R_API_END();
|
||||
return mkString(json_str);
|
||||
}
|
||||
|
||||
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
|
||||
SEXP ret;
|
||||
R_API_BEGIN();
|
||||
@@ -77,12 +93,16 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
din = REAL(mat);
|
||||
}
|
||||
std::vector<float> data(nrow * ncol);
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (omp_ulong i = 0; i < nrow; ++i) {
|
||||
for (size_t j = 0; j < ncol; ++j) {
|
||||
data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
|
||||
}
|
||||
exc.Run([&]() {
|
||||
for (size_t j = 0; j < ncol; ++j) {
|
||||
data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
DMatrixHandle handle;
|
||||
CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
|
||||
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
@@ -111,11 +131,15 @@ SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
for (size_t i = 0; i < nindptr; ++i) {
|
||||
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
|
||||
}
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int64_t i = 0; i < static_cast<int64_t>(ndata); ++i) {
|
||||
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
||||
data_[i] = static_cast<float>(p_data[i]);
|
||||
exc.Run([&]() {
|
||||
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
||||
data_[i] = static_cast<float>(p_data[i]);
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
DMatrixHandle handle;
|
||||
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
|
||||
BeginPtr(data_), nindptr, ndata,
|
||||
@@ -160,12 +184,16 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
|
||||
R_API_BEGIN();
|
||||
int len = length(array);
|
||||
const char *name = CHAR(asChar(field));
|
||||
dmlc::OMPException exc;
|
||||
if (!strcmp("group", name)) {
|
||||
std::vector<unsigned> vec(len);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < len; ++i) {
|
||||
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
|
||||
exc.Run([&]() {
|
||||
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
CHECK_CALL(XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(field)),
|
||||
BeginPtr(vec), len));
|
||||
@@ -173,8 +201,11 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
|
||||
std::vector<float> vec(len);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < len; ++i) {
|
||||
vec[i] = REAL(array)[i];
|
||||
exc.Run([&]() {
|
||||
vec[i] = REAL(array)[i];
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
CHECK_CALL(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(field)),
|
||||
BeginPtr(vec), len));
|
||||
@@ -265,11 +296,15 @@ SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
|
||||
<< "gradient and hess must have same length";
|
||||
int len = length(grad);
|
||||
std::vector<float> tgrad(len), thess(len);
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int j = 0; j < len; ++j) {
|
||||
tgrad[j] = REAL(grad)[j];
|
||||
thess[j] = REAL(hess)[j];
|
||||
exc.Run([&]() {
|
||||
tgrad[j] = REAL(grad)[j];
|
||||
thess[j] = REAL(hess)[j];
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
|
||||
R_ExternalPtrAddr(dtrain),
|
||||
BeginPtr(tgrad), BeginPtr(thess),
|
||||
|
||||
@@ -21,6 +21,19 @@
|
||||
*/
|
||||
XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle);
|
||||
|
||||
/*!
|
||||
* \brief Set global configuration
|
||||
* \param json_str a JSON string representing the list of key-value pairs
|
||||
* \return R_NilValue
|
||||
*/
|
||||
XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str);
|
||||
|
||||
/*!
|
||||
* \brief Get global configuration
|
||||
* \return JSON string
|
||||
*/
|
||||
XGB_DLL SEXP XGBGetGlobalConfig_R();
|
||||
|
||||
/*!
|
||||
* \brief load a data matrix
|
||||
* \param fname name of the content
|
||||
|
||||
@@ -16,7 +16,7 @@ void CustomLogMessage::Log(const std::string& msg) {
|
||||
namespace xgboost {
|
||||
ConsoleLogger::~ConsoleLogger() {
|
||||
if (cur_verbosity_ == LogVerbosity::kIgnore ||
|
||||
cur_verbosity_ <= global_verbosity_) {
|
||||
cur_verbosity_ <= GlobalVerbosity()) {
|
||||
dmlc::CustomLogMessage::Log(log_stream_.str());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# of saved model files from XGBoost version 0.90 and 1.0.x.
|
||||
library(xgboost)
|
||||
library(Matrix)
|
||||
source('./generate_models_params.R')
|
||||
|
||||
set.seed(0)
|
||||
metadata <- list(
|
||||
@@ -53,11 +52,16 @@ generate_logistic_model <- function () {
|
||||
y <- sample(0:1, size = metadata$kRows, replace = TRUE)
|
||||
stopifnot(max(y) == 1, min(y) == 0)
|
||||
|
||||
data <- xgb.DMatrix(X, label = y, weight = w)
|
||||
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
|
||||
max_depth = metadata$kMaxDepth, objective = 'binary:logistic')
|
||||
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
|
||||
save_booster(booster, 'logit')
|
||||
objective <- c('binary:logistic', 'binary:logitraw')
|
||||
name <- c('logit', 'logitraw')
|
||||
|
||||
for (i in seq_len(length(objective))) {
|
||||
data <- xgb.DMatrix(X, label = y, weight = w)
|
||||
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
|
||||
max_depth = metadata$kMaxDepth, objective = objective[i])
|
||||
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
|
||||
save_booster(booster, name[i])
|
||||
}
|
||||
}
|
||||
|
||||
generate_classification_model <- function () {
|
||||
|
||||
@@ -66,7 +66,7 @@ test_that("parameter validation works", {
|
||||
xgb.train(params = params, data = dtrain, nrounds = nrounds))
|
||||
print(output)
|
||||
}
|
||||
expect_output(incorrect(), "bar, foo")
|
||||
expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"')
|
||||
})
|
||||
|
||||
|
||||
|
||||
21
R-package/tests/testthat/test_config.R
Normal file
21
R-package/tests/testthat/test_config.R
Normal file
@@ -0,0 +1,21 @@
|
||||
context('Test global configuration')
|
||||
|
||||
test_that('Global configuration works with verbosity', {
|
||||
old_verbosity <- xgb.get.config()$verbosity
|
||||
for (v in c(0, 1, 2, 3)) {
|
||||
xgb.set.config(verbosity = v)
|
||||
expect_equal(xgb.get.config()$verbosity, v)
|
||||
}
|
||||
xgb.set.config(verbosity = old_verbosity)
|
||||
expect_equal(xgb.get.config()$verbosity, old_verbosity)
|
||||
})
|
||||
|
||||
test_that('Global configuration works with use_rmm flag', {
|
||||
old_use_rmm_flag <- xgb.get.config()$use_rmm
|
||||
for (v in c(TRUE, FALSE)) {
|
||||
xgb.set.config(use_rmm = v)
|
||||
expect_equal(xgb.get.config()$use_rmm, v)
|
||||
}
|
||||
xgb.set.config(use_rmm = old_use_rmm_flag)
|
||||
expect_equal(xgb.get.config()$use_rmm, old_use_rmm_flag)
|
||||
})
|
||||
@@ -39,6 +39,10 @@ run_booster_check <- function (booster, name) {
|
||||
testthat::expect_equal(config$learner$learner_train_param$objective, 'multi:softmax')
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class),
|
||||
metadata$kClasses)
|
||||
} else if (name == 'logitraw') {
|
||||
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
|
||||
testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logitraw')
|
||||
} else if (name == 'logit') {
|
||||
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
[](./LICENSE)
|
||||
[](http://cran.r-project.org/web/packages/xgboost)
|
||||
[](https://pypi.python.org/pypi/xgboost/)
|
||||
[](https://anaconda.org/conda-forge/py-xgboost)
|
||||
[](https://optuna.org)
|
||||
[](https://twitter.com/XGBoostProject)
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "../src/metric/elementwise_metric.cc"
|
||||
#include "../src/metric/multiclass_metric.cc"
|
||||
#include "../src/metric/rank_metric.cc"
|
||||
#include "../src/metric/auc.cc"
|
||||
#include "../src/metric/survival_metric.cc"
|
||||
|
||||
// objectives
|
||||
@@ -67,6 +68,7 @@
|
||||
// global
|
||||
#include "../src/learner.cc"
|
||||
#include "../src/logging.cc"
|
||||
#include "../src/global_config.cc"
|
||||
#include "../src/common/common.cc"
|
||||
#include "../src/common/random.cc"
|
||||
#include "../src/common/charconv.cc"
|
||||
|
||||
@@ -1 +1 @@
|
||||
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@-SNAPSHOT
|
||||
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@
|
||||
|
||||
@@ -6,11 +6,11 @@ function(setup_rpackage_install_target rlib_target build_dir)
|
||||
install(
|
||||
DIRECTORY "${xgboost_SOURCE_DIR}/R-package"
|
||||
DESTINATION "${build_dir}"
|
||||
REGEX "src/*" EXCLUDE
|
||||
REGEX "R-package/configure" EXCLUDE
|
||||
PATTERN "src/*" EXCLUDE
|
||||
PATTERN "R-package/configure" EXCLUDE
|
||||
)
|
||||
install(TARGETS ${rlib_target}
|
||||
LIBRARY DESTINATION "${build_dir}/R-package/src/"
|
||||
RUNTIME DESTINATION "${build_dir}/R-package/src/")
|
||||
install(SCRIPT ${PROJECT_BINARY_DIR}/RPackageInstall.cmake)
|
||||
endfunction()
|
||||
endfunction()
|
||||
|
||||
@@ -62,7 +62,7 @@ test:data = "agaricus.txt.test"
|
||||
We use the tree booster and logistic regression objective in our setting. This indicates that we accomplish our task using classic gradient boosting regression tree(GBRT), which is a promising method for binary classification.
|
||||
|
||||
The parameters shown in the example gives the most common ones that are needed to use xgboost.
|
||||
If you are interested in more parameter settings, the complete parameter settings and detailed descriptions are [here](../../doc/parameter.rst). Besides putting the parameters in the configuration file, we can set them by passing them as arguments as below:
|
||||
If you are interested in more parameter settings, the complete parameter settings and detailed descriptions are [here](https://xgboost.readthedocs.io/en/stable/parameter.html). Besides putting the parameters in the configuration file, we can set them by passing them as arguments as below:
|
||||
|
||||
```
|
||||
../../xgboost mushroom.conf max_depth=6
|
||||
@@ -161,4 +161,3 @@ Eg. ```nthread=10```
|
||||
|
||||
Set nthread to be the number of your real cpu (On Unix, this can be found using ```lscpu```)
|
||||
Some systems will have ```Thread(s) per core = 2```, for example, a 4 core cpu with 8 threads, in such case set ```nthread=4``` and not 8.
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
Regression
|
||||
====
|
||||
Using XGBoost for regression is very similar to using it for binary classification. We suggest that you can refer to the [binary classification demo](../binary_classification) first. In XGBoost if we use negative log likelihood as the loss function for regression, the training procedure is same as training binary classifier of XGBoost.
|
||||
Using XGBoost for regression is very similar to using it for binary classification. We suggest that you can refer to the [binary classification demo](../binary_classification) first. In XGBoost if we use negative log likelihood as the loss function for regression, the training procedure is same as training binary classifier of XGBoost.
|
||||
|
||||
### Tutorial
|
||||
The dataset we used is the [computer hardware dataset from UCI repository](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware). The demo for regression is almost the same as the [binary classification demo](../binary_classification), except a little difference in general parameter:
|
||||
@@ -14,4 +14,3 @@ objective = reg:squarederror
|
||||
```
|
||||
|
||||
The input format is same as binary classification, except that the label is now the target regression values. We use linear regression here, if we want use objective = reg:logistic logistic regression, the label needed to be pre-scaled into [0,1].
|
||||
|
||||
|
||||
@@ -60,9 +60,9 @@ This is a list of short codes introducing different functionalities of xgboost p
|
||||
Most of examples in this section are based on CLI or python version.
|
||||
However, the parameter settings can be applied to all versions
|
||||
|
||||
- [Binary classification](binary_classification)
|
||||
- [Binary classification](CLI/binary_classification)
|
||||
- [Multiclass classification](multiclass_classification)
|
||||
- [Regression](regression)
|
||||
- [Regression](CLI/regression)
|
||||
- [Learning to Rank](rank)
|
||||
|
||||
### Benchmarks
|
||||
@@ -110,6 +110,7 @@ Please send pull requests if you find ones that are missing here.
|
||||
|
||||
## Tutorials
|
||||
|
||||
- [XGBoost Training with Dask, using Saturn Cloud](https://www.saturncloud.io/docs/tutorials/xgboost/)
|
||||
- [Machine Learning with XGBoost on Qubole Spark Cluster](https://www.qubole.com/blog/machine-learning-xgboost-qubole-spark-cluster/)
|
||||
- [XGBoost Official RMarkdown Tutorials](https://xgboost.readthedocs.org/en/latest/R-package/index.html#tutorials)
|
||||
- [An Introduction to XGBoost R Package](http://dmlc.ml/rstats/2016/03/10/xgboost.html) by Tong He
|
||||
@@ -144,6 +145,8 @@ Send a PR to add a one sentence description:)
|
||||
## Tools using XGBoost
|
||||
|
||||
- [BayesBoost](https://github.com/mpearmain/BayesBoost) - Bayesian Optimization using xgboost and sklearn API
|
||||
- [FLAML](https://github.com/microsoft/FLAML) - An open source AutoML library
|
||||
designed to automatically produce accurate machine learning models with low computational cost. FLAML includes [XGBoost as one of the default learners](https://github.com/microsoft/FLAML/blob/main/flaml/model.py) and can also be used as a fast hyperparameter tuning tool for XGBoost ([code example](https://github.com/microsoft/FLAML/blob/main/notebook/flaml_xgboost.ipynb)).
|
||||
- [gp_xgboost_gridsearch](https://github.com/vatsan/gp_xgboost_gridsearch) - In-database parallel grid-search for XGBoost on [Greenplum](https://github.com/greenplum-db/gpdb) using PL/Python
|
||||
- [tpot](https://github.com/rhiever/tpot) - A Python tool that automatically creates and optimizes machine learning pipelines using genetic programming.
|
||||
|
||||
|
||||
@@ -28,10 +28,10 @@ def main(args):
|
||||
'colsample_bynode': 0.5},
|
||||
dtrain, num_boost_round=10,
|
||||
evals=[(dtrain, 'd')])
|
||||
featue_map = bst.get_fscore()
|
||||
feature_map = bst.get_fscore()
|
||||
# feature zero has 0 weight
|
||||
assert featue_map.get('f0', None) is None
|
||||
assert max(featue_map.values()) == featue_map.get('f9')
|
||||
assert feature_map.get('f0', None) is None
|
||||
assert max(feature_map.values()) == feature_map.get('f9')
|
||||
|
||||
if args.plot:
|
||||
xgboost.plot_importance(bst)
|
||||
|
||||
@@ -1,21 +1,54 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from sklearn.datasets import load_svmlight_file
|
||||
|
||||
# load data in do training
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
num_round = 3
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
train = os.path.join(CURRENT_DIR, "../data/agaricus.txt.train")
|
||||
test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test")
|
||||
|
||||
print('start testing prediction from first n trees')
|
||||
# predict using first 1 tree
|
||||
label = dtest.get_label()
|
||||
ypred1 = bst.predict(dtest, ntree_limit=1)
|
||||
# by default, we predict using all the trees
|
||||
ypred2 = bst.predict(dtest)
|
||||
print('error of ypred1=%f' % (np.sum((ypred1 > 0.5) != label) / float(len(label))))
|
||||
print('error of ypred2=%f' % (np.sum((ypred2 > 0.5) != label) / float(len(label))))
|
||||
|
||||
def native_interface():
|
||||
# load data in do training
|
||||
dtrain = xgb.DMatrix(train)
|
||||
dtest = xgb.DMatrix(test)
|
||||
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
num_round = 3
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
|
||||
print("start testing prediction from first n trees")
|
||||
# predict using first 1 tree
|
||||
label = dtest.get_label()
|
||||
ypred1 = bst.predict(dtest, iteration_range=(0, 1))
|
||||
# by default, we predict using all the trees
|
||||
ypred2 = bst.predict(dtest)
|
||||
|
||||
print("error of ypred1=%f" % (np.sum((ypred1 > 0.5) != label) / float(len(label))))
|
||||
print("error of ypred2=%f" % (np.sum((ypred2 > 0.5) != label) / float(len(label))))
|
||||
|
||||
|
||||
def sklearn_interface():
|
||||
X_train, y_train = load_svmlight_file(train)
|
||||
X_test, y_test = load_svmlight_file(test)
|
||||
clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1, use_label_encoder=False)
|
||||
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
|
||||
assert clf.n_classes_ == 2
|
||||
|
||||
print("start testing prediction from first n trees")
|
||||
# predict using first 1 tree
|
||||
ypred1 = clf.predict(X_test, iteration_range=(0, 1))
|
||||
# by default, we predict using all the trees
|
||||
ypred2 = clf.predict(X_test)
|
||||
|
||||
print(
|
||||
"error of ypred1=%f" % (np.sum((ypred1 > 0.5) != y_test) / float(len(y_test)))
|
||||
)
|
||||
print(
|
||||
"error of ypred2=%f" % (np.sum((ypred2 > 0.5) != y_test) / float(len(y_test)))
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
native_interface()
|
||||
sklearn_interface()
|
||||
|
||||
@@ -5,9 +5,9 @@ objective="rank:pairwise"
|
||||
|
||||
# Tree Booster Parameters
|
||||
# step size shrinkage
|
||||
eta = 0.1
|
||||
eta = 0.1
|
||||
# minimum loss reduction required to make a further partition
|
||||
gamma = 1.0
|
||||
gamma = 1.0
|
||||
# minimum sum of instance weight(hessian) needed in a child
|
||||
min_child_weight = 0.1
|
||||
# maximum depth of a tree
|
||||
@@ -17,12 +17,10 @@ max_depth = 6
|
||||
# the number of round to do boosting
|
||||
num_round = 4
|
||||
# 0 means do not save any model except the final round model
|
||||
save_period = 0
|
||||
save_period = 0
|
||||
# The path of training data
|
||||
data = "mq2008.train"
|
||||
data = "mq2008.train"
|
||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||
eval[test] = "mq2008.vali"
|
||||
# The path of test data
|
||||
test:data = "mq2008.test"
|
||||
|
||||
|
||||
eval[test] = "mq2008.vali"
|
||||
# The path of test data
|
||||
test:data = "mq2008.test"
|
||||
|
||||
@@ -5,13 +5,16 @@ from dask.distributed import Client
|
||||
from dask_cuda import LocalCUDACluster
|
||||
|
||||
def main(client):
|
||||
# Inform XGBoost that RMM is used for GPU memory allocation
|
||||
xgb.set_config(use_rmm=True)
|
||||
|
||||
X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
|
||||
X = dask.array.from_array(X)
|
||||
y = dask.array.from_array(y)
|
||||
dtrain = xgb.dask.DaskDMatrix(client, X, label=y)
|
||||
|
||||
params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
|
||||
'tree_method': 'gpu_hist'}
|
||||
'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
|
||||
output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
|
||||
evals=[(dtrain, 'train')])
|
||||
bst = output['booster']
|
||||
|
||||
@@ -4,6 +4,8 @@ from sklearn.datasets import make_classification
|
||||
|
||||
# Initialize RMM pool allocator
|
||||
rmm.reinitialize(pool_allocator=True)
|
||||
# Inform XGBoost that RMM is used for GPU memory allocation
|
||||
xgb.set_config(use_rmm=True)
|
||||
|
||||
X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
|
||||
dtrain = xgb.DMatrix(X, label=y)
|
||||
|
||||
@@ -27,7 +27,7 @@ def paginate_request(url, callback):
|
||||
r = requests.get(r.links['next']['url'], auth=(username, password))
|
||||
callback(json.loads(r.text))
|
||||
|
||||
for line in git.log(f'{from_commit}..{to_commit}', '--pretty=format:%s', '--reverse'):
|
||||
for line in git.log(f'{from_commit}..{to_commit}', '--pretty=format:%s', '--reverse', '--first-parent'):
|
||||
m = re.search('\(#([0-9]+)\)$', line.rstrip())
|
||||
if m:
|
||||
pr_id = m.group(1)
|
||||
|
||||
129
dev/release-pypi.py
Normal file
129
dev/release-pypi.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Simple script for downloading and checking pypi release wheels.
|
||||
|
||||
tqdm, sh are required to run this script.
|
||||
"""
|
||||
from urllib.request import urlretrieve
|
||||
import argparse
|
||||
from typing import List
|
||||
from sh.contrib import git
|
||||
from distutils import version
|
||||
import subprocess
|
||||
import tqdm
|
||||
import os
|
||||
|
||||
# The package building is managed by Jenkins CI.
|
||||
PREFIX = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/release_"
|
||||
DIST = os.path.join(os.path.curdir, "python-package", "dist")
|
||||
|
||||
pbar = None
|
||||
|
||||
|
||||
def show_progress(block_num, block_size, total_size):
|
||||
"Show file download progress."
|
||||
global pbar
|
||||
if pbar is None:
|
||||
pbar = tqdm.tqdm(total=total_size / 1024, unit="kB")
|
||||
|
||||
downloaded = block_num * block_size
|
||||
if downloaded < total_size:
|
||||
pbar.update(block_size / 1024)
|
||||
else:
|
||||
pbar.close()
|
||||
pbar = None
|
||||
|
||||
|
||||
def retrieve(url, filename=None):
|
||||
return urlretrieve(url, filename, reporthook=show_progress)
|
||||
|
||||
|
||||
def lastest_hash() -> str:
|
||||
"Get latest commit hash."
|
||||
ret = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True)
|
||||
assert ret.returncode == 0, "Failed to get lastest commit hash."
|
||||
commit_hash = ret.stdout.decode("utf-8").strip()
|
||||
return commit_hash
|
||||
|
||||
|
||||
def download_wheels(
|
||||
platforms: List[str],
|
||||
dir_URL: str,
|
||||
src_filename_prefix: str,
|
||||
target_filename_prefix: str,
|
||||
) -> List[str]:
|
||||
"""Download all binary wheels. dir_URL is the URL for remote directory storing the release
|
||||
wheels
|
||||
|
||||
"""
|
||||
|
||||
filenames = []
|
||||
for platform in platforms:
|
||||
src_wheel = src_filename_prefix + platform + ".whl"
|
||||
url = dir_URL + src_wheel
|
||||
|
||||
target_wheel = target_filename_prefix + platform + ".whl"
|
||||
filename = os.path.join(DIST, target_wheel)
|
||||
filenames.append(filename)
|
||||
print("Downloading from:", url, "to:", filename)
|
||||
retrieve(url=url, filename=filename)
|
||||
ret = subprocess.run(["twine", "check", filename], capture_output=True)
|
||||
assert ret.returncode == 0, "Failed twine check"
|
||||
stderr = ret.stderr.decode("utf-8")
|
||||
stdout = ret.stdout.decode("utf-8")
|
||||
assert stderr.find("warning") == -1, "Unresolved warnings:\n" + stderr
|
||||
assert stdout.find("warning") == -1, "Unresolved warnings:\n" + stdout
|
||||
|
||||
return filenames
|
||||
|
||||
|
||||
def check_path():
|
||||
root = os.path.abspath(os.path.curdir)
|
||||
assert os.path.basename(root) == "xgboost", "Must be run on project root."
|
||||
|
||||
|
||||
def main(args: argparse.Namespace) -> None:
|
||||
check_path()
|
||||
|
||||
rel = version.StrictVersion(args.release)
|
||||
platforms = [
|
||||
"win_amd64",
|
||||
"manylinux2010_x86_64",
|
||||
"manylinux2014_aarch64",
|
||||
"macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64",
|
||||
]
|
||||
print("Release:", rel)
|
||||
major, minor, patch = rel.version
|
||||
branch = "release_" + str(major) + "." + str(minor) + ".0"
|
||||
git.clean("-xdf")
|
||||
git.checkout(branch)
|
||||
git.pull("origin", branch)
|
||||
git.submodule("update")
|
||||
commit_hash = lastest_hash()
|
||||
|
||||
dir_URL = PREFIX + str(major) + "." + str(minor) + ".0" + "/"
|
||||
src_filename_prefix = "xgboost-" + args.release + "%2B" + commit_hash + "-py3-none-"
|
||||
target_filename_prefix = "xgboost-" + args.release + "-py3-none-"
|
||||
|
||||
if not os.path.exists(DIST):
|
||||
os.mkdir(DIST)
|
||||
|
||||
filenames = download_wheels(
|
||||
platforms, dir_URL, src_filename_prefix, target_filename_prefix
|
||||
)
|
||||
print("List of downloaded wheels:", filenames)
|
||||
print(
|
||||
"""
|
||||
Following steps should be done manually:
|
||||
- Generate source package by running `python setup.py sdist`.
|
||||
- Upload pypi package by `python3 -m twine upload dist/<Package Name>` for all wheels.
|
||||
- Check the uploaded files on `https://pypi.org/project/xgboost/<VERSION>/#files` and `pip
|
||||
install xgboost==<VERSION>` """
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--release", type=str, required=True, help="Version tag, e.g. '1.3.2'."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
91
dev/release-tarball.sh
Executable file
91
dev/release-tarball.sh
Executable file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Helper script for creating release tarball.
|
||||
|
||||
# Print a short description and usage synopsis for this script.
print_usage() {
    printf "Script for making release source tarball.\n"
    printf "Usage:\n\trelease-tarball.sh <TAG>\n\n"
}
||||
# Print an error message in red, followed by the usage text.
#   $1 - message to display
print_error() {
    local msg=$1
    # Pass the message as a printf argument instead of embedding it in the
    # format string, so a "%" inside the message cannot be misinterpreted.
    printf "\u001b[31mError\u001b[0m: %s\n\n" "$msg"
    print_usage
}
|
||||
# Validate that a non-empty tag argument was supplied; abort otherwise.
#   $1 - release tag
check_input() {
    local TAG=$1
    # Quote the expansion so an empty/absent tag is tested reliably.
    if [ -z "$TAG" ]; then
        print_error "Empty tag argument"
        # `exit` takes 0-255; `exit -1` is non-portable (wraps to 255).
        exit 1
    fi
}
|
||||
# Verify the script runs from the project root (or from xgboost/dev, in
# which case move up to the root). Aborts when invoked from anywhere else.
#   $1 - absolute path of the invoking directory
check_curdir() {
    local CUR_ABS=$1
    printf "Current directory: %s\n" "$CUR_ABS"
    local CUR=$(basename "$CUR_ABS")

    # Allow running from the dev/ subdirectory by stepping up to the root.
    if [ "$CUR" == "dev" ]; then
        cd .. || exit 1
        CUR=$(basename "$(pwd)")
    fi

    if [ "$CUR" != "xgboost" ]; then
        print_error "Must be in project root or xgboost/dev. Current directory: ${CUR}"
        exit 1
    fi
}
|
||||
# Remove all submodules.
|
||||
# Strip all git metadata (top-level repository and submodules) so the tree
# can be tarred as a plain source snapshot.
#   $1 - release tag to check out first
cleanup_git() {
    local TAG=$1
    check_input "$TAG"

    git checkout "$TAG" || exit 1

    # Extract the submodule directories ("path = <dir>") from .gitmodules.
    local SUBMODULES=$(grep "path = " ./.gitmodules | cut -f 3 --delimiter=' ' -)

    # $SUBMODULES is intentionally unquoted: word splitting yields one
    # iteration per submodule path.
    for module in $SUBMODULES; do
        rm -rf "${module}/.git"
    done

    rm -rf .git
}
|
||||
# Create xgboost.tar.gz from the cleaned "xgboost" tree in the current
# directory and record its SHA-256 digest.
#   $1 - directory containing the cleaned tree (for the log message)
#   $2 - directory to copy the tarball and hash file back into
make_tarball() {
    local SRCDIR=$1
    local CUR_ABS=$2
    tar -czf xgboost.tar.gz xgboost

    # Quote all path expansions so directories with spaces do not break
    # the copy or the hash computation.
    printf "Copying %s/xgboost.tar.gz back to %s/xgboost.tar.gz .\n" "$SRCDIR" "$CUR_ABS"
    cp xgboost.tar.gz "${CUR_ABS}/xgboost.tar.gz"
    printf "Writing hash to %s/hash .\n" "$CUR_ABS"
    sha256sum -z "${CUR_ABS}/xgboost.tar.gz" | cut -f 1 --delimiter=' ' > "${CUR_ABS}/hash"
}
|
||||
# Entry point: copy the tree into a temporary directory, strip git metadata
# for the given tag, and produce xgboost.tar.gz plus its hash next to the
# directory the script was started from.
#   $1 - release tag
main() {
    local TAG=$1
    check_input "$TAG"

    local CUR_ABS=$(pwd)
    check_curdir "$CUR_ABS"

    local TMPDIR=$(mktemp -d)
    printf "tmpdir: %s\n" "$TMPDIR"

    # Start from a pristine tree so build artifacts never enter the tarball.
    git clean -xdf || exit 1
    cp -R . "$TMPDIR/xgboost"
    pushd .

    cd "$TMPDIR/xgboost" || exit 1
    cleanup_git "$TAG"

    cd ..
    make_tarball "$TMPDIR" "$CUR_ABS"

    popd
    rm -rf "$TMPDIR"
}
|
||||
# Quote the tag so an empty or space-containing argument reaches
# check_input intact instead of disappearing via word splitting.
main "$1"
|
||||
Submodule dmlc-core updated: 5df8305fe6...f00e3ec7ab
@@ -2,18 +2,15 @@
|
||||
Installation Guide
|
||||
##################
|
||||
|
||||
.. note:: Pre-built binary wheel for Python
|
||||
.. note:: Pre-built binary wheel for Python: now with GPU support
|
||||
|
||||
If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, available from Python Package Index (PyPI). You may download and install it by running
|
||||
If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, to avoid the trouble of building XGBoost from the source. You may download and install it by running
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Ensure that you are downloading one of the following:
|
||||
# * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
|
||||
# * xgboost-{version}-py2.py3-none-win_amd64.whl
|
||||
pip3 install xgboost
|
||||
|
||||
* The binary wheel will support GPU algorithms (`gpu_hist`) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
|
||||
* The binary wheel will support the GPU algorithm (``gpu_hist``) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
|
||||
* Currently, we provide binary wheels for 64-bit Linux, macOS and Windows.
|
||||
* Nightly builds are available. You can go to `this page
|
||||
<https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/list.html>`_, find the
|
||||
@@ -23,6 +20,21 @@ Installation Guide
|
||||
|
||||
pip install <url to the wheel>
|
||||
|
||||
.. note:: (EXPERIMENTAL) Pre-built binary package for R: now with GPU support
|
||||
|
||||
If you are planning to use R, consider installing ``{xgboost}`` from a pre-built binary package, to avoid the trouble of building XGBoost from the source. The binary package will let you use the GPU algorithm (``gpu_hist``) out of the box, as long as your machine has NVIDIA GPUs.
|
||||
|
||||
Download the binary package from the Releases page. The file name will be of the form ``xgboost_r_gpu_linux_[version].tar.gz``. Then install XGBoost by running:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Install dependencies
|
||||
R -q -e "install.packages(c('data.table', 'magrittr', 'jsonlite', 'remotes'))"
|
||||
# Install XGBoost
|
||||
R CMD INSTALL ./xgboost_r_gpu_linux.tar.gz
|
||||
|
||||
Currently, we provide the binary package for 64-bit Linux.
|
||||
|
||||
|
||||
****************************
|
||||
Building XGBoost from source
|
||||
|
||||
@@ -94,6 +94,8 @@ extensions = [
|
||||
'recommonmark'
|
||||
]
|
||||
|
||||
autodoc_typehints = "description"
|
||||
|
||||
graphviz_output_format = 'png'
|
||||
plot_formats = [('svg', 300), ('png', 100), ('hires.png', 300)]
|
||||
plot_html_show_source_link = False
|
||||
|
||||
@@ -11,3 +11,20 @@ Starting from XGBoost 1.0.0, each XGBoost release will be versioned as [MAJOR].[
|
||||
* MAJOR: We guarantee the API compatibility across releases with the same major version number. We expect to have a 1+ year development period for a new MAJOR release version.
|
||||
* FEATURE: We ship new features, improvements and bug fixes through feature releases. The cycle length of a feature is decided by the size of feature roadmap. The roadmap is decided right after the previous release.
|
||||
* MAINTENANCE: Maintenance version only contains bug fixes. This type of release only occurs when we found significant correctness and/or performance bugs and barrier for users to upgrade to a new version of XGBoost smoothly.
|
||||
|
||||
|
||||
Making a Release
|
||||
-----------------
|
||||
|
||||
1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
|
||||
2. Bump release version.
|
||||
1. Modify ``CMakeLists.txt`` source tree, run CMake.
|
||||
2. Modify ``DESCRIPTION`` in R-package.
|
||||
3. Run ``change_version.sh`` in ``jvm-packages/dev``
|
||||
3. Commit the change, create a PR on github on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.
|
||||
4. Create a tag on release branch, either on github or locally.
|
||||
5. Make a release on github tag page, which might be done with previous step if the tag is created on github.
|
||||
6. Submit pip, cran and maven packages.
|
||||
- pip package is maintained by [Hyunsu Cho](http://hyunsu-cho.io/) and [Jiaming Yuan](https://github.com/trivialfis). There's a helper script for downloading pre-built wheels on ``xgboost/dev/release-pypi.py`` along with simple instructions for using ``twine``.
|
||||
- cran package is maintained by [Tong He](https://github.com/hetong007).
|
||||
- maven package is maintained by [Nan Zhu](https://github.com/CodingCat).
|
||||
|
||||
@@ -22,6 +22,7 @@ Contents
|
||||
XGBoost User Forum <https://discuss.xgboost.ai>
|
||||
GPU support <gpu/index>
|
||||
parameter
|
||||
treemethod
|
||||
Python package <python/index>
|
||||
R package <R-package/index>
|
||||
JVM package <jvm/index>
|
||||
|
||||
@@ -88,6 +88,12 @@
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"split_type": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"default_left": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
@@ -247,6 +253,18 @@
|
||||
"learner": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"feature_names": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"feature_types": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"gradient_booster": {
|
||||
"oneOf": [
|
||||
{
|
||||
|
||||
@@ -16,6 +16,14 @@ Before running XGBoost, we must set three types of parameters: general parameter
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
********************
|
||||
Global Configuration
|
||||
********************
|
||||
The following parameters can be set in the global scope, using ``xgb.config_context()`` (Python) or ``xgb.set.config()`` (R).
|
||||
|
||||
* ``verbosity``: Verbosity of printing messages. Valid values of 0 (silent), 1 (warning), 2 (info), and 3 (debug).
|
||||
* ``use_rmm``: Whether to use RAPIDS Memory Manager (RMM) to allocate GPU memory. This option is only applicable when XGBoost is built (compiled) with the RMM plugin enabled. Valid values are ``true`` and ``false``.
|
||||
|
||||
******************
|
||||
General Parameters
|
||||
******************
|
||||
@@ -67,8 +75,8 @@ Parameters for Tree Booster
|
||||
|
||||
* ``max_depth`` [default=6]
|
||||
|
||||
- Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 is only accepted in ``lossguided`` growing policy when tree_method is set as ``hist`` and it indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree.
|
||||
- range: [0,∞] (0 is only accepted in ``lossguided`` growing policy when tree_method is set as ``hist``)
|
||||
- Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 is only accepted in ``lossguided`` growing policy when tree_method is set as ``hist`` or ``gpu_hist`` and it indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree.
|
||||
- range: [0,∞] (0 is only accepted in ``lossguided`` growing policy when tree_method is set as ``hist`` or ``gpu_hist``)
|
||||
|
||||
* ``min_child_weight`` [default=1]
|
||||
|
||||
@@ -123,7 +131,7 @@ Parameters for Tree Booster
|
||||
|
||||
* ``tree_method`` string [default= ``auto``]
|
||||
|
||||
- The tree construction algorithm used in XGBoost. See description in the `reference paper <http://arxiv.org/abs/1603.02754>`_.
|
||||
- The tree construction algorithm used in XGBoost. See description in the `reference paper <http://arxiv.org/abs/1603.02754>`_ and :doc:`treemethod`.
|
||||
- XGBoost supports ``approx``, ``hist`` and ``gpu_hist`` for distributed training. Experimental support for external memory is available for ``approx`` and ``gpu_hist``.
|
||||
|
||||
- Choices: ``auto``, ``exact``, ``approx``, ``hist``, ``gpu_hist``, this is a
|
||||
@@ -188,7 +196,7 @@ Parameters for Tree Booster
|
||||
* ``grow_policy`` [default= ``depthwise``]
|
||||
|
||||
- Controls a way new nodes are added to the tree.
|
||||
- Currently supported only if ``tree_method`` is set to ``hist``.
|
||||
- Currently supported only if ``tree_method`` is set to ``hist`` or ``gpu_hist``.
|
||||
- Choices: ``depthwise``, ``lossguide``
|
||||
|
||||
- ``depthwise``: split at nodes closest to the root.
|
||||
@@ -200,7 +208,7 @@ Parameters for Tree Booster
|
||||
|
||||
* ``max_bin``, [default=256]
|
||||
|
||||
- Only used if ``tree_method`` is set to ``hist``.
|
||||
- Only used if ``tree_method`` is set to ``hist`` or ``gpu_hist``.
|
||||
- Maximum number of discrete bins to bucket continuous features.
|
||||
- Increasing this number improves the optimality of splits at the cost of higher computation time.
|
||||
|
||||
@@ -392,8 +400,16 @@ Specify the learning task and the corresponding learning objective. The objectiv
|
||||
- ``error@t``: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'.
|
||||
- ``merror``: Multiclass classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``.
|
||||
- ``mlogloss``: `Multiclass logloss <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html>`_.
|
||||
- ``auc``: `Area under the curve <http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve>`_
|
||||
- ``aucpr``: `Area under the PR curve <https://en.wikipedia.org/wiki/Precision_and_recall>`_
|
||||
- ``auc``: `Receiver Operating Characteristic Area under the Curve <http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve>`_.
|
||||
Available for classification and learning-to-rank tasks.
|
||||
|
||||
- When used with binary classification, the objective should be ``binary:logistic`` or similar functions that work on probability.
|
||||
- When used with multi-class classification, objective should be ``multi:softprob`` instead of ``multi:softmax``, as the latter doesn't output probability. Also the AUC is calculated by 1-vs-rest with reference class weighted by class prevalence.
|
||||
- When used with LTR task, the AUC is computed by comparing pairs of documents to count correctly sorted pairs. This corresponds to pairwise learning to rank. The implementation has some issues with average AUC around groups and distributed workers not being well-defined.
|
||||
- On a single machine the AUC calculation is exact. In a distributed environment the AUC is a weighted average over the AUC of training rows on each node - therefore, distributed AUC is an approximation sensitive to the distribution of data across workers. Use another metric in distributed environments if precision and reproducibility are important.
|
||||
- If input dataset contains only negative or positive samples the output is `NaN`.
|
||||
|
||||
- ``aucpr``: `Area under the PR curve <https://en.wikipedia.org/wiki/Precision_and_recall>`_. Available for binary classification and learning-to-rank tasks.
|
||||
- ``ndcg``: `Normalized Discounted Cumulative Gain <http://en.wikipedia.org/wiki/NDCG>`_
|
||||
- ``map``: `Mean Average Precision <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_
|
||||
- ``ndcg@n``, ``map@n``: 'n' can be assigned as an integer to cut off the top positions in the lists for evaluation.
|
||||
|
||||
@@ -6,6 +6,14 @@ This page gives the Python API reference of xgboost, please also refer to Python
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
Global Configuration
|
||||
--------------------
|
||||
.. autofunction:: xgboost.config_context
|
||||
|
||||
.. autofunction:: xgboost.set_config
|
||||
|
||||
.. autofunction:: xgboost.get_config
|
||||
|
||||
Core Data Structure
|
||||
-------------------
|
||||
.. automodule:: xgboost.core
|
||||
@@ -85,9 +93,15 @@ Dask API
|
||||
--------
|
||||
.. automodule:: xgboost.dask
|
||||
|
||||
.. autofunction:: xgboost.dask.DaskDMatrix
|
||||
.. autoclass:: xgboost.dask.DaskDMatrix
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autofunction:: xgboost.dask.DaskDeviceQuantileDMatrix
|
||||
.. autoclass:: xgboost.dask.DaskDeviceQuantileDMatrix
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autofunction:: xgboost.dask.train
|
||||
|
||||
@@ -95,6 +109,27 @@ Dask API
|
||||
|
||||
.. autofunction:: xgboost.dask.inplace_predict
|
||||
|
||||
.. autofunction:: xgboost.dask.DaskXGBClassifier
|
||||
.. autoclass:: xgboost.dask.DaskXGBClassifier
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autofunction:: xgboost.dask.DaskXGBRegressor
|
||||
.. autoclass:: xgboost.dask.DaskXGBRegressor
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autoclass:: xgboost.dask.DaskXGBRanker
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autoclass:: xgboost.dask.DaskXGBRFRegressor
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autoclass:: xgboost.dask.DaskXGBRFClassifier
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
101
doc/treemethod.rst
Normal file
101
doc/treemethod.rst
Normal file
@@ -0,0 +1,101 @@
|
||||
####################
|
||||
XGBoost Tree Methods
|
||||
####################
|
||||
|
||||
For training boosted tree models, there are 2 parameters used for choosing algorithms,
|
||||
namely ``updater`` and ``tree_method``. XGBoost has 4 builtin tree methods, namely
|
||||
``exact``, ``approx``, ``hist`` and ``gpu_hist``. Along with these tree methods, there
|
||||
are also some free standing updaters including ``grow_local_histmaker``, ``refresh``,
|
||||
``prune`` and ``sync``. The parameter ``updater`` is more primitive than ``tree_method``
|
||||
as the latter is just a pre-configuration of the former. The difference is mostly due to
|
||||
historical reasons that each updater requires some specific configurations and might have
|
||||
missing features. As we are moving forward, the gap between them is becoming more and
|
||||
more irrelevant. We will collectively document them under tree methods.
|
||||
|
||||
**************
|
||||
Exact Solution
|
||||
**************
|
||||
|
||||
Exact means XGBoost considers all candidates from data for tree splitting, but underlying
|
||||
the objective is still interpreted as a Taylor expansion.
|
||||
|
||||
1. ``exact``: Vanilla tree boosting tree algorithm described in `reference paper
|
||||
<http://arxiv.org/abs/1603.02754>`_. During each split finding procedure, it iterates
|
||||
over every entry of input data. It's more accurate (among other greedy methods) but
|
||||
slow in computation performance. Also it doesn't support distributed training as
|
||||
XGBoost employs row splitting data distribution while ``exact`` tree method works on a
|
||||
sorted column format. This tree method can be used with parameter ``tree_method`` set
|
||||
to ``exact``.
|
||||
|
||||
|
||||
**********************
|
||||
Approximated Solutions
|
||||
**********************
|
||||
|
||||
As ``exact`` tree method is slow in performance and not scalable, we often employ
|
||||
approximated training algorithms. These algorithms build a gradient histogram for each
|
||||
node and iterate through the histogram instead of real dataset. Here we introduce the
|
||||
implementations in XGBoost below.
|
||||
|
||||
1. ``grow_local_histmaker`` updater: An approximation tree method described in `reference
|
||||
paper <http://arxiv.org/abs/1603.02754>`_. This updater is rarely used in practice so
|
||||
it's still an updater rather than tree method. During split finding, it first runs a
|
||||
weighted GK sketching for data points belong to current node to find split candidates,
|
||||
using hessian as weights. The histogram is built upon this per-node sketch. It's
|
||||
faster than ``exact`` in some applications, but still slow in computation.
|
||||
|
||||
2. ``approx`` tree method: An approximation tree method described in `reference paper
|
||||
<http://arxiv.org/abs/1603.02754>`_. Different from ``grow_local_histmaker``, it runs
|
||||
sketching before building each tree using all the rows (rows belonging to the root)
|
||||
instead of per-node dataset. Similar to ``grow_local_histmaker`` updater, hessian is
|
||||
used as weights during sketch. The algorithm can be accessed by setting
|
||||
``tree_method`` to ``approx``.
|
||||
|
||||
3. ``hist`` tree method: An approximation tree method used in LightGBM with slight
|
||||
differences in implementation. It runs sketching before training using only user
|
||||
provided weights instead of hessian. The subsequent per-node histogram is built upon
|
||||
this global sketch. This is the fastest algorithm as it runs sketching only once. The
|
||||
algorithm can be accessed by setting ``tree_method`` to ``hist``.
|
||||
|
||||
4. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
|
||||
``hist``, with additional support for gradient based sampling. The algorithm can be
|
||||
accessed by setting ``tree_method`` to ``gpu_hist``.
|
||||
|
||||
************
|
||||
Implications
|
||||
************
|
||||
|
||||
Some objectives like ``reg:squarederror`` have constant hessian. In this case, ``hist``
|
||||
or ``gpu_hist`` should be preferred as weighted sketching doesn't make sense with constant
|
||||
weights. When using non-constant hessian objectives, sometimes ``approx`` yields better
|
||||
accuracy, but with slower computation performance. Most of the time using ``(gpu)_hist``
|
||||
with higher ``max_bin`` can achieve similar or even superior accuracy while maintaining
|
||||
good performance. However, as xgboost is largely driven by community effort, the actual
|
||||
implementations have some differences than pure math description. Result might have
|
||||
slight differences than expectation, which we are currently trying to overcome.
|
||||
|
||||
**************
|
||||
Other Updaters
|
||||
**************
|
||||
|
||||
1. ``Pruner``: It prunes the built tree by ``gamma`` parameter. ``pruner`` is usually
|
||||
used as part of other tree methods.
|
||||
2. ``Refresh``: Refresh the statistics of built trees on a new training dataset.
|
||||
3. ``Sync``: Synchronize the tree among workers when running distributed training.
|
||||
|
||||
****************
|
||||
Removed Updaters
|
||||
****************
|
||||
|
||||
2 Updaters were removed during development due to maintainability. We describe them here
|
||||
solely for the interest of documentation. First one is distributed colmaker, which was a
|
||||
distributed version of exact tree method. It required specialization for column based
|
||||
splitting strategy and a different prediction procedure. As the exact tree method is slow
|
||||
by itself and scaling is even less efficient, we removed it entirely. Second one is
|
||||
``skmaker``. Per-node weighted sketching employed by ``grow_local_histmaker`` is slow,
|
||||
the ``skmaker`` was unmaintained and seems to be a workaround trying to eliminate the
|
||||
histogram creation step and uses sketching values directly during split evaluation. It
|
||||
was never tested and contained some unknown bugs, we decided to remove it and focus our
|
||||
resources on more promising algorithms instead. For accuracy, most of the time
|
||||
``approx``, ``hist`` and ``gpu_hist`` are enough with some parameter tuning, so removing
|
||||
them doesn't have any real practical impact.
|
||||
@@ -51,12 +51,12 @@ on a dask cluster:
|
||||
num_obs = 1e5
|
||||
num_features = 20
|
||||
X = da.random.random(
|
||||
size=(num_obs, num_features)
|
||||
size=(num_obs, num_features),
|
||||
chunks=(1000, num_features)
|
||||
)
|
||||
y = da.random.choice(
|
||||
a=[0, 1],
|
||||
size=num_obs,
|
||||
replace=True
|
||||
y = da.random.random(
|
||||
size=(num_obs, 1),
|
||||
chunks=(1000, 1)
|
||||
)
|
||||
|
||||
dtrain = xgb.dask.DaskDMatrix(client, X, y)
|
||||
@@ -64,7 +64,7 @@ on a dask cluster:
|
||||
output = xgb.dask.train(client,
|
||||
{'verbosity': 2,
|
||||
'tree_method': 'hist',
|
||||
'objective': 'binary:logistic'
|
||||
'objective': 'reg:squarederror'
|
||||
},
|
||||
dtrain,
|
||||
num_boost_round=4, evals=[(dtrain, 'train')])
|
||||
@@ -95,17 +95,28 @@ For prediction, pass the ``output`` returned by ``train`` into ``xgb.dask.predic
|
||||
.. code-block:: python
|
||||
|
||||
prediction = xgb.dask.predict(client, output, dtrain)
|
||||
# Or equivalently, pass ``output['booster']``:
|
||||
prediction = xgb.dask.predict(client, output['booster'], dtrain)
|
||||
|
||||
Or equivalently, pass ``output['booster']``:
|
||||
Eliminating the construction of DaskDMatrix is also possible, this can make the
|
||||
computation a bit faster when meta information like ``base_margin`` is not needed:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
prediction = xgb.dask.predict(client, output['booster'], dtrain)
|
||||
prediction = xgb.dask.predict(client, output, X)
|
||||
# Use inplace version.
|
||||
prediction = xgb.dask.inplace_predict(client, output, X)
|
||||
|
||||
Here ``prediction`` is a dask ``Array`` object containing predictions from model.
|
||||
Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
|
||||
is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the
|
||||
``predict`` function or using ``inplace_predict``, the output type depends on input data.
|
||||
See next section for details.
|
||||
|
||||
Alternatively, XGBoost also implements the Scikit-Learn interface with ``DaskXGBClassifier``
|
||||
and ``DaskXGBRegressor``. See ``xgboost/demo/dask`` for more examples.
|
||||
Alternatively, XGBoost also implements the Scikit-Learn interface with
|
||||
``DaskXGBClassifier``, ``DaskXGBRegressor``, ``DaskXGBRanker`` and 2 random forest
|
||||
variants. This wrapper is similar to the single node Scikit-Learn interface in xgboost,
|
||||
with dask collection as inputs and has an additional ``client`` attribute. See
|
||||
``xgboost/demo/dask`` for more examples.
|
||||
|
||||
|
||||
******************
|
||||
@@ -136,9 +147,49 @@ Also for inplace prediction:
|
||||
.. code-block:: python
|
||||
|
||||
booster.set_param({'predictor': 'gpu_predictor'})
|
||||
# where X is a dask DataFrame or dask Array.
|
||||
# where X is a dask DataFrame or dask Array containing cupy or cuDF backed data.
|
||||
prediction = xgb.dask.inplace_predict(client, booster, X)
|
||||
|
||||
When input is ``da.Array`` object, output is always ``da.Array``. However, if the input
|
||||
type is ``dd.DataFrame``, output can be ``dd.Series``, ``dd.DataFrame`` or ``da.Array``,
|
||||
depending on output shape. For example, when shap based prediction is used, the return
|
||||
value can have 3 or 4 dimensions , in such cases an ``Array`` is always returned.
|
||||
|
||||
The performance of running prediction, either using ``predict`` or ``inplace_predict``, is
|
||||
sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` or
|
||||
``dd.map_partitions``. When number of partitions is large and each of them have only
|
||||
small amount of data, the overhead of calling predict becomes visible. On the other hand,
|
||||
if not using GPU, the number of threads used for prediction on each block matters. Right
|
||||
now, xgboost uses single thread for each partition. If the number of blocks on each
|
||||
workers is smaller than number of cores, then the CPU workers might not be fully utilized.
|
||||
|
||||
One simple optimization for running consecutive predictions is using
|
||||
``distributed.Future``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
dataset = [X_0, X_1, X_2]
|
||||
booster_f = client.scatter(booster, broadcast=True)
|
||||
futures = []
|
||||
for X in dataset:
|
||||
# Here we pass in a future instead of concrete booster
|
||||
shap_f = xgb.dask.predict(client, booster_f, X, pred_contribs=True)
|
||||
futures.append(shap_f)
|
||||
|
||||
results = client.gather(futures)
|
||||
|
||||
|
||||
This is only available on functional interface, as the Scikit-Learn wrapper doesn't know
|
||||
how to maintain a valid future for booster. To obtain the booster object from
|
||||
Scikit-Learn wrapper object:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
cls = xgb.dask.DaskXGBClassifier()
|
||||
cls.fit(X, y)
|
||||
|
||||
booster = cls.get_booster()
|
||||
|
||||
|
||||
***************************
|
||||
Working with other clusters
|
||||
@@ -209,17 +260,17 @@ will override the configuration in Dask. For example:
|
||||
with dask.distributed.LocalCluster(n_workers=7, threads_per_worker=4) as cluster:
|
||||
|
||||
There are 4 threads allocated for each dask worker. Then by default XGBoost will use 4
|
||||
threads in each process for both training and prediction. But if ``nthread`` parameter is
|
||||
set:
|
||||
threads in each process for training. But if ``nthread`` parameter is set:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
output = xgb.dask.train(client,
|
||||
{'verbosity': 1,
|
||||
'nthread': 8,
|
||||
'tree_method': 'hist'},
|
||||
dtrain,
|
||||
num_boost_round=4, evals=[(dtrain, 'train')])
|
||||
output = xgb.dask.train(
|
||||
client,
|
||||
{"verbosity": 1, "nthread": 8, "tree_method": "hist"},
|
||||
dtrain,
|
||||
num_boost_round=4,
|
||||
evals=[(dtrain, "train")],
|
||||
)
|
||||
|
||||
XGBoost will use 8 threads in each training process.
|
||||
|
||||
@@ -252,12 +303,12 @@ Functional interface:
|
||||
with_X = await xgb.dask.predict(client, output, X)
|
||||
inplace = await xgb.dask.inplace_predict(client, output, X)
|
||||
|
||||
# Use `client.compute` instead of the `compute` method from dask collection
|
||||
# Use ``client.compute`` instead of the ``compute`` method from dask collection
|
||||
print(await client.compute(with_m))
|
||||
|
||||
|
||||
While for the Scikit-Learn interface, trivial methods like ``set_params`` and accessing class
|
||||
attributes like ``evals_result_`` do not require ``await``. Other methods involving
|
||||
attributes like ``evals_result()`` do not require ``await``. Other methods involving
|
||||
actual computation will return a coroutine and hence require awaiting:
|
||||
|
||||
.. code-block:: python
|
||||
@@ -273,6 +324,126 @@ actual computation will return a coroutine and hence require awaiting:
|
||||
# Use `client.compute` instead of the `compute` method from dask collection
|
||||
print(await client.compute(prediction))
|
||||
|
||||
*****************************
|
||||
Evaluation and Early Stopping
|
||||
*****************************
|
||||
|
||||
.. versionadded:: 1.3.0
|
||||
|
||||
The Dask interface allows the use of validation sets that are stored in distributed collections (Dask DataFrame or Dask Array). These can be used for evaluation and early stopping.
|
||||
|
||||
To enable early stopping, pass one or more validation sets containing ``DaskDMatrix`` objects.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import dask.array as da
|
||||
import xgboost as xgb
|
||||
|
||||
num_rows = 1e6
|
||||
num_features = 100
|
||||
num_partitions = 10
|
||||
rows_per_chunk = num_rows / num_partitions
|
||||
|
||||
data = da.random.random(
|
||||
size=(num_rows, num_features),
|
||||
chunks=(rows_per_chunk, num_features)
|
||||
)
|
||||
|
||||
labels = da.random.random(
|
||||
size=(num_rows, 1),
|
||||
chunks=(rows_per_chunk, 1)
|
||||
)
|
||||
|
||||
X_eval = da.random.random(
|
||||
size=(num_rows, num_features),
|
||||
chunks=(rows_per_chunk, num_features)
|
||||
)
|
||||
|
||||
y_eval = da.random.random(
|
||||
size=(num_rows, 1),
|
||||
chunks=(rows_per_chunk, 1)
|
||||
)
|
||||
|
||||
dtrain = xgb.dask.DaskDMatrix(
|
||||
client=client,
|
||||
data=data,
|
||||
label=labels
|
||||
)
|
||||
|
||||
dvalid = xgb.dask.DaskDMatrix(
|
||||
client=client,
|
||||
data=X_eval,
|
||||
label=y_eval
|
||||
)
|
||||
|
||||
result = xgb.dask.train(
|
||||
client=client,
|
||||
params={
|
||||
"objective": "reg:squarederror",
|
||||
},
|
||||
dtrain=dtrain,
|
||||
num_boost_round=10,
|
||||
evals=[(dvalid, "valid1")],
|
||||
early_stopping_rounds=3
|
||||
)
|
||||
|
||||
When validation sets are provided to ``xgb.dask.train()`` in this way, the model object returned by ``xgb.dask.train()`` contains a history of evaluation metrics for each validation set, across all boosting rounds.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
print(result["history"])
|
||||
# {'valid1': OrderedDict([('rmse', [0.28857, 0.28858, 0.288592, 0.288598])])}
|
||||
|
||||
If early stopping is enabled by also passing ``early_stopping_rounds``, you can check the best iteration in the returned booster.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
booster = result["booster"]
|
||||
print(booster.best_iteration)
|
||||
best_model = booster[: booster.best_iteration]
|
||||
|
||||
|
||||
*******************
|
||||
Other customization
|
||||
*******************
|
||||
|
||||
XGBoost dask interface accepts other advanced features found in single node Python
|
||||
interface, including callback functions, custom evaluation metric and objective:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def eval_error_metric(predt, dtrain: xgb.DMatrix):
|
||||
label = dtrain.get_label()
|
||||
r = np.zeros(predt.shape)
|
||||
gt = predt > 0.5
|
||||
r[gt] = 1 - label[gt]
|
||||
le = predt <= 0.5
|
||||
r[le] = label[le]
|
||||
return 'CustomErr', np.sum(r)
|
||||
|
||||
# custom callback
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
rounds=early_stopping_rounds,
|
||||
metric_name="CustomErr",
|
||||
data_name="Train",
|
||||
save_best=True,
|
||||
)
|
||||
|
||||
booster = xgb.dask.train(
|
||||
client,
|
||||
params={
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["error", "rmse"],
|
||||
"tree_method": "hist",
|
||||
},
|
||||
dtrain=D_train,
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
feval=eval_error_metric, # custom evaluation metric
|
||||
num_boost_round=100,
|
||||
callbacks=[early_stop],
|
||||
)
|
||||
|
||||
|
||||
*****************************************************************************
|
||||
Why is the initialization of ``DaskDMatrix`` so slow and throws weird errors
|
||||
*****************************************************************************
|
||||
@@ -314,16 +485,3 @@ References:
|
||||
|
||||
#. https://github.com/dask/dask/issues/6833
|
||||
#. https://stackoverflow.com/questions/45941528/how-to-efficiently-send-a-large-numpy-array-to-the-cluster-with-dask-array
|
||||
|
||||
***********
|
||||
Limitations
|
||||
***********
|
||||
|
||||
Basic functionality including model training and generating classification and regression predictions
|
||||
have been implemented. However, there are still some other limitations we haven't
|
||||
addressed yet:
|
||||
|
||||
- Label encoding for the ``DaskXGBClassifier`` classifier may not be supported. So users need
|
||||
to encode their training labels into discrete values first.
|
||||
- Ranking is not yet supported.
|
||||
- Callback functions are not tested.
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Introduction to Boosted Trees
|
||||
#############################
|
||||
XGBoost stands for "Extreme Gradient Boosting", where the term "Gradient Boosting" originates from the paper *Greedy Function Approximation: A Gradient Boosting Machine*, by Friedman.
|
||||
This is a tutorial on gradient boosted trees, and most of the content is based on `these slides <http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf>`_ by Tianqi Chen, the original author of XGBoost.
|
||||
|
||||
The **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic.
|
||||
This tutorial will explain boosted trees in a self-contained and principled way using the elements of supervised learning.
|
||||
|
||||
@@ -9,10 +9,9 @@ open format that can be easily reused. The support for binary format will be co
|
||||
the future until JSON format is no-longer experimental and has satisfying performance.
|
||||
This tutorial aims to share some basic insights into the JSON serialisation method used in
|
||||
XGBoost. Without explicitly mentioned, the following sections assume you are using the
|
||||
experimental JSON format, which can be enabled by passing
|
||||
``enable_experimental_json_serialization=True`` as training parameter, or provide the file
|
||||
name with ``.json`` as file extension when saving/loading model:
|
||||
``booster.save_model('model.json')``. More details below.
|
||||
JSON format, which can be enabled by providing the file name with ``.json`` as file
|
||||
extension when saving/loading model: ``booster.save_model('model.json')``. More details
|
||||
below.
|
||||
|
||||
Before we get started, XGBoost is a gradient boosting library with focus on tree model,
|
||||
which means inside XGBoost, there are 2 distinct parts:
|
||||
@@ -66,26 +65,7 @@ a filename with ``.json`` as file extension:
|
||||
|
||||
xgb.save(bst, 'model_file_name.json')
|
||||
|
||||
To use JSON to store memory snapshots, add ``enable_experimental_json_serialization`` as a training
|
||||
parameter. In Python this can be done by:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
bst = xgboost.train({'enable_experimental_json_serialization': True}, dtrain)
|
||||
with open('filename', 'wb') as fd:
|
||||
pickle.dump(bst, fd)
|
||||
|
||||
Notice the ``filename`` is for Python intrinsic function ``open``, not for XGBoost. Hence
|
||||
parameter ``enable_experimental_json_serialization`` is required to enable JSON format.
|
||||
|
||||
Similarly, in the R package, add ``enable_experimental_json_serialization`` to the training
|
||||
parameter:
|
||||
|
||||
.. code-block:: r
|
||||
|
||||
params <- list(enable_experimental_json_serialization = TRUE, ...)
|
||||
bst <- xgboost.train(params, dtrain, nrounds = 10)
|
||||
saveRDS(bst, 'filename.rds')
|
||||
While for memory snapshot, JSON is the default starting with xgboost 1.3.
|
||||
|
||||
***************************************************************
|
||||
A note on backward compatibility of models and memory snapshots
|
||||
@@ -110,11 +90,11 @@ Custom objective and metric
|
||||
***************************
|
||||
|
||||
XGBoost accepts user provided objective and metric functions as an extension. These
|
||||
functions are not saved in model file as they are language dependent feature. With
|
||||
functions are not saved in model file as they are language dependent features. With
|
||||
Python, user can pickle the model to include these functions in saved binary. One
|
||||
drawback is, the output from pickle is not a stable serialization format and doesn't work
|
||||
on different Python version or XGBoost version, not to mention different language
|
||||
environment. Another way to workaround this limitation is to provide these functions
|
||||
on different Python version nor XGBoost version, not to mention different language
|
||||
environments. Another way to workaround this limitation is to provide these functions
|
||||
again after the model is loaded. If the customized function is useful, please consider
|
||||
making a PR for implementing it inside XGBoost, this way we can have your functions
|
||||
working with different language bindings.
|
||||
@@ -128,9 +108,9 @@ models are valuable. One way to restore it in the future is to load it back wit
|
||||
specific version of Python and XGBoost, export the model by calling `save_model`. To help
|
||||
easing the mitigation, we created a simple script for converting pickled XGBoost 0.90
|
||||
Scikit-Learn interface object to XGBoost 1.0.0 native model. Please note that the script
|
||||
suits simple use cases, and it's advised not to use pickle when stability is needed.
|
||||
It's located in ``xgboost/doc/python`` with the name ``convert_090to100.py``. See
|
||||
comments in the script for more details.
|
||||
suits simple use cases, and it's advised not to use pickle when stability is needed. It's
|
||||
located in ``xgboost/doc/python`` with the name ``convert_090to100.py``. See comments in
|
||||
the script for more details.
|
||||
|
||||
A similar procedure may be used to recover the model persisted in an old RDS file. In R, you are
|
||||
able to install an older version of XGBoost using the ``remotes`` package:
|
||||
@@ -172,7 +152,6 @@ Will print out something similiar to (not actual output as it's too long for dem
|
||||
{
|
||||
"Learner": {
|
||||
"generic_parameter": {
|
||||
"enable_experimental_json_serialization": "0",
|
||||
"gpu_id": "0",
|
||||
"gpu_page_size": "0",
|
||||
"n_jobs": "0",
|
||||
|
||||
@@ -55,7 +55,7 @@
|
||||
#endif // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4)
|
||||
|
||||
#if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \
|
||||
!defined(__CUDACC__)
|
||||
!defined(__CUDACC__) && !defined(__sun) && !defined(sun)
|
||||
#include <parallel/algorithm>
|
||||
#define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z))
|
||||
#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \
|
||||
|
||||
@@ -63,6 +63,23 @@ XGB_DLL const char *XGBGetLastError(void);
|
||||
*/
|
||||
XGB_DLL int XGBRegisterLogCallback(void (*callback)(const char*));
|
||||
|
||||
/*!
|
||||
* \brief Set global configuration (collection of parameters that apply globally). This function
|
||||
* accepts the list of key-value pairs representing the global-scope parameters to be
|
||||
* configured. The list of key-value pairs are passed in as a JSON string.
|
||||
* \param json_str a JSON string representing the list of key-value pairs. The JSON object shall
|
||||
* be flat: no value can be a JSON object or an array.
|
||||
* \return 0 for success, -1 for failure
|
||||
*/
|
||||
XGB_DLL int XGBSetGlobalConfig(const char* json_str);
|
||||
|
||||
/*!
|
||||
* \brief Get current global configuration (collection of parameters that apply globally).
|
||||
* \param json_str pointer to received returned global configuration, represented as a JSON string.
|
||||
* \return 0 for success, -1 for failure
|
||||
*/
|
||||
XGB_DLL int XGBGetGlobalConfig(const char** json_str);
|
||||
|
||||
/*!
|
||||
* \brief load a data matrix
|
||||
* \param fname the name of the file
|
||||
@@ -81,7 +98,7 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname,
|
||||
* \param data fvalue
|
||||
* \param nindptr number of rows in the matrix + 1
|
||||
* \param nelem number of nonzero elements in the matrix
|
||||
* \param num_col number of columns; when it's set to 0, then guess from data
|
||||
* \param num_col number of columns; when it's set to kAdapterUnknownSize, then guess from data
|
||||
* \param out created dmatrix
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
@@ -92,6 +109,27 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||
size_t nelem,
|
||||
size_t num_col,
|
||||
DMatrixHandle* out);
|
||||
|
||||
/*!
|
||||
* \brief Create a matrix from CSR matrix.
|
||||
* \param indptr JSON encoded __array_interface__ to row pointers in CSR.
|
||||
* \param indices JSON encoded __array_interface__ to column indices in CSR.
|
||||
* \param data JSON encoded __array_interface__ to values in CSR.
|
||||
* \param num_col Number of columns.
|
||||
* \param json_config JSON encoded configuration. Required values are:
|
||||
*
|
||||
* - missing
|
||||
* - nthread
|
||||
*
|
||||
* \param out created dmatrix
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
|
||||
char const *indices, char const *data,
|
||||
bst_ulong ncol,
|
||||
char const* json_config,
|
||||
DMatrixHandle* out);
|
||||
|
||||
/*!
|
||||
* \brief create a matrix content from CSC format
|
||||
* \param col_ptr pointer to col headers
|
||||
@@ -597,6 +635,15 @@ XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer,
|
||||
int end_layer, int step,
|
||||
BoosterHandle *out);
|
||||
|
||||
/*!
|
||||
* \brief Get number of boosted rounds from gradient booster. When process_type is
|
||||
* update, this number might drop due to removed tree.
|
||||
* \param handle Handle to booster.
|
||||
* \param out Pointer to output integer.
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterBoostedRounds(BoosterHandle handle, int* out);
|
||||
|
||||
/*!
|
||||
* \brief set parameters
|
||||
* \param handle handle
|
||||
@@ -657,8 +704,9 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle,
|
||||
const char *evnames[],
|
||||
bst_ulong len,
|
||||
const char **out_result);
|
||||
|
||||
/*!
|
||||
* \brief make prediction based on dmat
|
||||
* \brief make prediction based on dmat (deprecated, use `XGBoosterPredictFromDMatrix` instead)
|
||||
* \param handle handle
|
||||
* \param dmat data matrix
|
||||
* \param option_mask bit-mask of options taken in prediction, possible values
|
||||
@@ -687,6 +735,167 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
|
||||
int training,
|
||||
bst_ulong *out_len,
|
||||
const float **out_result);
|
||||
/*!
|
||||
* \brief Make prediction from DMatrix, replacing `XGBoosterPredict`.
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param dmat DMatrix handle
|
||||
* \param c_json_config String encoded predict configuration in JSON format, with
|
||||
* following available fields in the JSON object:
|
||||
*
|
||||
* "type": [0, 6]
|
||||
* 0: normal prediction
|
||||
* 1: output margin
|
||||
* 2: predict contribution
|
||||
* 3: predict approximated contribution
|
||||
* 4: predict feature interaction
|
||||
* 5: predict approximated feature interaction
|
||||
* 6: predict leaf
|
||||
* "training": bool
|
||||
* Whether the prediction function is used as part of a training loop. **Not used
|
||||
* for inplace prediction**.
|
||||
*
|
||||
* Prediction can be run in 2 scenarios:
|
||||
* 1. Given data matrix X, obtain prediction y_pred from the model.
|
||||
* 2. Obtain the prediction for computing gradients. For example, DART booster performs dropout
|
||||
* during training, and the prediction result will be different from the one obtained by normal
|
||||
* inference step due to dropped trees.
|
||||
* Set training=false for the first scenario. Set training=true for the second
|
||||
* scenario. The second scenario applies when you are defining a custom objective
|
||||
* function.
|
||||
* "iteration_begin": int
|
||||
* Beginning iteration of prediction.
|
||||
* "iteration_end": int
|
||||
* End iteration of prediction. Set to 0 this will become the size of tree model (all the trees).
|
||||
* "strict_shape": bool
|
||||
* Whether should we reshape the output with stricter rules. If set to true,
|
||||
* normal/margin/contrib/interaction predict will output consistent shape
|
||||
* disregarding the use of multi-class model, and leaf prediction will output 4-dim
|
||||
* array representing: (n_samples, n_iterations, n_classes, n_trees_in_forest)
|
||||
*
|
||||
* Run a normal prediction with strict output shape, 2 dim for softprob , 1 dim for others.
|
||||
* \code
|
||||
* {
|
||||
* "type": 0,
|
||||
* "training": False,
|
||||
* "iteration_begin": 0,
|
||||
* "iteration_end": 0,
|
||||
* "strict_shape": true,
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* \param out_shape Shape of output prediction (copy before use).
|
||||
* \param out_dim Dimension of output prediction.
|
||||
* \param out_result Buffer storing prediction value (copy before use).
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
|
||||
DMatrixHandle dmat,
|
||||
char const* c_json_config,
|
||||
bst_ulong const **out_shape,
|
||||
bst_ulong *out_dim,
|
||||
float const **out_result);
|
||||
/*
|
||||
* \brief Inplace prediction from CPU dense matrix.
|
||||
*
|
||||
* \param handle Booster handle.
|
||||
* \param values JSON encoded __array_interface__ to values.
|
||||
* \param c_json_config See `XGBoosterPredictFromDMatrix` for more info.
|
||||
*
|
||||
* Additional fields for inplace prediction are:
|
||||
* "missing": float
|
||||
*
|
||||
* \param m An optional (NULL if not available) proxy DMatrix instance
|
||||
* storing meta info.
|
||||
*
|
||||
* \param out_shape See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_dim See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_result See `XGBoosterPredictFromDMatrix` for more info.
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle,
|
||||
char const *values,
|
||||
char const *c_json_config,
|
||||
DMatrixHandle m,
|
||||
bst_ulong const **out_shape,
|
||||
bst_ulong *out_dim,
|
||||
const float **out_result);
|
||||
|
||||
/*
|
||||
* \brief Inplace prediction from CPU CSR matrix.
|
||||
*
|
||||
* \param handle Booster handle.
|
||||
* \param indptr JSON encoded __array_interface__ to row pointer in CSR.
|
||||
* \param indices JSON encoded __array_interface__ to column indices in CSR.
|
||||
* \param values JSON encoded __array_interface__ to values in CSR..
|
||||
* \param ncol Number of features in data.
|
||||
* \param c_json_config See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* Additional fields for inplace prediction are:
|
||||
* "missing": float
|
||||
*
|
||||
* \param m An optional (NULL if not available) proxy DMatrix instance
|
||||
* storing meta info.
|
||||
*
|
||||
* \param out_shape See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_dim See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_result See `XGBoosterPredictFromDMatrix` for more info.
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr,
|
||||
char const *indices, char const *values,
|
||||
bst_ulong ncol,
|
||||
char const *c_json_config, DMatrixHandle m,
|
||||
bst_ulong const **out_shape,
|
||||
bst_ulong *out_dim,
|
||||
const float **out_result);
|
||||
|
||||
/*
|
||||
* \brief Inplace prediction from CUDA Dense matrix (cupy in Python).
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param values JSON encoded __cuda_array_interface__ to values.
|
||||
* \param c_json_config See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* Additional fields for inplace prediction are:
|
||||
* "missing": float
|
||||
*
|
||||
* \param m An optional (NULL if not available) proxy DMatrix instance
|
||||
* storing meta info.
|
||||
* \param out_shape See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_dim See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_result See `XGBoosterPredictFromDMatrix` for more info.
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterPredictFromCudaArray(
|
||||
BoosterHandle handle, char const *values, char const *c_json_config,
|
||||
DMatrixHandle m, bst_ulong const **out_shape, bst_ulong *out_dim,
|
||||
const float **out_result);
|
||||
|
||||
/*
|
||||
* \brief Inplace prediction from CUDA dense dataframe (cuDF in Python).
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param values List of __cuda_array_interface__ for all columns encoded in JSON list.
|
||||
* \param c_json_config See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* Additional fields for inplace prediction are:
|
||||
* "missing": float
|
||||
*
|
||||
* \param m An optional (NULL if not available) proxy DMatrix instance
|
||||
* storing meta info.
|
||||
* \param out_shape See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_dim See `XGBoosterPredictFromDMatrix` for more info.
|
||||
* \param out_result See `XGBoosterPredictFromDMatrix` for more info.
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterPredictFromCudaColumnar(
|
||||
BoosterHandle handle, char const *values, char const *c_json_config,
|
||||
DMatrixHandle m, bst_ulong const **out_shape, bst_ulong *out_dim,
|
||||
const float **out_result);
|
||||
|
||||
|
||||
/*
|
||||
* ========================== Begin Serialization APIs =========================
|
||||
@@ -925,4 +1134,46 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
|
||||
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
|
||||
bst_ulong* out_len,
|
||||
const char*** out);
|
||||
|
||||
/*!
|
||||
* \brief Set string encoded feature info in Booster, similar to the feature
|
||||
* info in DMatrix.
|
||||
*
|
||||
* Accepted fields are:
|
||||
* - feature_name
|
||||
* - feature_type
|
||||
*
|
||||
* \param handle An instance of Booster
|
||||
* \param field Feild name
|
||||
* \param features Pointer to array of strings.
|
||||
* \param size Size of `features` pointer (number of strings passed in).
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
|
||||
const char **features,
|
||||
const bst_ulong size);
|
||||
|
||||
/*!
|
||||
* \brief Get string encoded feature info from Booster, similar to feature info
|
||||
* in DMatrix.
|
||||
*
|
||||
* Accepted fields are:
|
||||
* - feature_name
|
||||
* - feature_type
|
||||
*
|
||||
* Caller is responsible for copying out the data, before next call to any API
|
||||
* function of XGBoost.
|
||||
*
|
||||
* \param handle An instance of Booster
|
||||
* \param field Feild name
|
||||
* \param size Size of output pointer `features` (number of strings returned).
|
||||
* \param out_features Address of a pointer to array of strings. Result is stored in
|
||||
* thread local memory.
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
|
||||
bst_ulong *len,
|
||||
const char ***out_features);
|
||||
#endif // XGBOOST_C_API_H_
|
||||
|
||||
@@ -252,15 +252,6 @@ class SparsePage {
|
||||
/*! \brief an instance of sparse vector in the batch */
|
||||
using Inst = common::Span<Entry const>;
|
||||
|
||||
/*! \brief get i-th row from the batch */
|
||||
inline Inst operator[](size_t i) const {
|
||||
const auto& data_vec = data.HostVector();
|
||||
const auto& offset_vec = offset.HostVector();
|
||||
size_t size = offset_vec[i + 1] - offset_vec[i];
|
||||
return {data_vec.data() + offset_vec[i],
|
||||
static_cast<Inst::index_type>(size)};
|
||||
}
|
||||
|
||||
HostSparsePageView GetView() const {
|
||||
return {offset.ConstHostSpan(), data.ConstHostSpan()};
|
||||
}
|
||||
@@ -299,15 +290,19 @@ class SparsePage {
|
||||
|
||||
void SortRows() {
|
||||
auto ncol = static_cast<bst_omp_uint>(this->Size());
|
||||
#pragma omp parallel for default(none) shared(ncol) schedule(dynamic, 1)
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
|
||||
std::sort(
|
||||
this->data.HostVector().begin() + this->offset.HostVector()[i],
|
||||
this->data.HostVector().begin() + this->offset.HostVector()[i + 1],
|
||||
Entry::CmpValue);
|
||||
}
|
||||
exc.Run([&]() {
|
||||
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
|
||||
std::sort(
|
||||
this->data.HostVector().begin() + this->offset.HostVector()[i],
|
||||
this->data.HostVector().begin() + this->offset.HostVector()[i + 1],
|
||||
Entry::CmpValue);
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2014-2020 by Contributors
|
||||
* Copyright 2014-2021 by Contributors
|
||||
* \file gbm.h
|
||||
* \brief Interface of gradient booster,
|
||||
* that learns through gradient statistics.
|
||||
@@ -63,7 +63,7 @@ class GradientBooster : public Model, public Configurable {
|
||||
/*!
|
||||
* \brief Slice a model using boosting index. The slice m:n indicates taking all trees
|
||||
* that were fit during the boosting rounds m, (m+1), (m+2), ..., (n-1).
|
||||
* \param layer_begin Begining of boosted tree layer used for prediction.
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
* \param out Output gradient booster
|
||||
*/
|
||||
@@ -79,6 +79,9 @@ class GradientBooster : public Model, public Configurable {
|
||||
virtual bool AllowLazyCheckPoint() const {
|
||||
return false;
|
||||
}
|
||||
/*! \brief Return number of boosted rounds.
|
||||
*/
|
||||
virtual int32_t BoostedRounds() const = 0;
|
||||
/*!
|
||||
* \brief perform update to the model(boosting)
|
||||
* \param p_fmat feature matrix that provide access to features
|
||||
@@ -96,15 +99,14 @@ class GradientBooster : public Model, public Configurable {
|
||||
* \param out_preds output vector to hold the predictions
|
||||
* \param training Whether the prediction value is used for training. For dart booster
|
||||
* drop out is performed during training.
|
||||
* \param ntree_limit limit the number of trees used in prediction,
|
||||
* when it equals 0, this means we do not limit
|
||||
* number of trees, this parameter is only valid
|
||||
* for gbtree, but not for gblinear
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
*/
|
||||
virtual void PredictBatch(DMatrix* dmat,
|
||||
PredictionCacheEntry* out_preds,
|
||||
bool training,
|
||||
unsigned ntree_limit = 0) = 0;
|
||||
unsigned layer_begin,
|
||||
unsigned layer_end) = 0;
|
||||
|
||||
/*!
|
||||
* \brief Inplace prediction.
|
||||
@@ -112,10 +114,10 @@ class GradientBooster : public Model, public Configurable {
|
||||
* \param x A type erased data adapter.
|
||||
* \param missing Missing value in the data.
|
||||
* \param [in,out] out_preds The output preds.
|
||||
* \param layer_begin (Optional) Begining of boosted tree layer used for prediction.
|
||||
* \param layer_begin (Optional) Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end (Optional) End of booster layer. 0 means do not limit trees.
|
||||
*/
|
||||
virtual void InplacePredict(dmlc::any const &, float,
|
||||
virtual void InplacePredict(dmlc::any const &, std::shared_ptr<DMatrix>, float,
|
||||
PredictionCacheEntry*,
|
||||
uint32_t,
|
||||
uint32_t) const {
|
||||
@@ -129,44 +131,45 @@ class GradientBooster : public Model, public Configurable {
|
||||
*
|
||||
* \param inst the instance you want to predict
|
||||
* \param out_preds output vector to hold the predictions
|
||||
* \param ntree_limit limit the number of trees used in prediction
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
* \sa Predict
|
||||
*/
|
||||
virtual void PredictInstance(const SparsePage::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
unsigned ntree_limit = 0) = 0;
|
||||
unsigned layer_begin, unsigned layer_end) = 0;
|
||||
/*!
|
||||
* \brief predict the leaf index of each tree, the output will be nsample * ntree vector
|
||||
* this is only valid in gbtree predictor
|
||||
* \param dmat feature matrix
|
||||
* \param out_preds output vector to hold the predictions
|
||||
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
|
||||
* we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
*/
|
||||
virtual void PredictLeaf(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_preds,
|
||||
unsigned ntree_limit = 0) = 0;
|
||||
virtual void PredictLeaf(DMatrix *dmat,
|
||||
HostDeviceVector<bst_float> *out_preds,
|
||||
unsigned layer_begin, unsigned layer_end) = 0;
|
||||
|
||||
/*!
|
||||
* \brief feature contributions to individual predictions; the output will be a vector
|
||||
* of length (nfeats + 1) * num_output_group * nsample, arranged in that order
|
||||
* \param dmat feature matrix
|
||||
* \param out_contribs output vector to hold the contributions
|
||||
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
|
||||
* we do not limit number of trees
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
* \param approximate use a faster (inconsistent) approximation of SHAP values
|
||||
* \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on).
|
||||
* \param condition_feature feature to condition on (i.e. fix) during calculations
|
||||
*/
|
||||
virtual void PredictContribution(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit = 0,
|
||||
unsigned layer_begin, unsigned layer_end,
|
||||
bool approximate = false, int condition = 0,
|
||||
unsigned condition_feature = 0) = 0;
|
||||
|
||||
virtual void PredictInteractionContributions(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) = 0;
|
||||
virtual void PredictInteractionContributions(
|
||||
DMatrix *dmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate) = 0;
|
||||
|
||||
/*!
|
||||
* \brief dump the model in the requested format
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
||||
// Constant representing the device ID of CPU.
|
||||
static int32_t constexpr kCpuId = -1;
|
||||
@@ -26,9 +27,10 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
||||
int nthread;
|
||||
// primary device, -1 means no gpu.
|
||||
int gpu_id;
|
||||
// fail when gpu_id is invalid
|
||||
bool fail_on_invalid_gpu_id {false};
|
||||
// gpu page size in external memory mode, 0 means using the default.
|
||||
size_t gpu_page_size;
|
||||
bool enable_experimental_json_serialization {true};
|
||||
bool validate_parameters {false};
|
||||
|
||||
void CheckDeprecated() {
|
||||
@@ -64,14 +66,13 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
||||
.set_default(-1)
|
||||
.set_lower_bound(-1)
|
||||
.describe("The primary GPU device ordinal.");
|
||||
DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
|
||||
.set_default(false)
|
||||
.describe("Fail with error when gpu_id is invalid.");
|
||||
DMLC_DECLARE_FIELD(gpu_page_size)
|
||||
.set_default(0)
|
||||
.set_lower_bound(0)
|
||||
.describe("GPU page size when running in external memory mode.");
|
||||
DMLC_DECLARE_FIELD(enable_experimental_json_serialization)
|
||||
.set_default(true)
|
||||
.describe("Enable using JSON for memory serialization (Python Pickle, "
|
||||
"rabit checkpoints etc.).");
|
||||
DMLC_DECLARE_FIELD(validate_parameters)
|
||||
.set_default(false)
|
||||
.describe("Enable checking whether parameters are used or not.");
|
||||
|
||||
34
include/xgboost/global_config.h
Normal file
34
include/xgboost/global_config.h
Normal file
@@ -0,0 +1,34 @@
|
||||
/*!
|
||||
* Copyright 2020 by Contributors
|
||||
* \file global_config.h
|
||||
* \brief Global configuration for XGBoost
|
||||
* \author Hyunsu Cho
|
||||
*/
|
||||
#ifndef XGBOOST_GLOBAL_CONFIG_H_
|
||||
#define XGBOOST_GLOBAL_CONFIG_H_
|
||||
|
||||
#include <xgboost/parameter.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
namespace xgboost {
|
||||
class Json;
|
||||
|
||||
struct GlobalConfiguration : public XGBoostParameter<GlobalConfiguration> {
|
||||
int verbosity { 1 };
|
||||
bool use_rmm { false };
|
||||
DMLC_DECLARE_PARAMETER(GlobalConfiguration) {
|
||||
DMLC_DECLARE_FIELD(verbosity)
|
||||
.set_range(0, 3)
|
||||
.set_default(1) // shows only warning
|
||||
.describe("Flag to print out detailed breakdown of runtime.");
|
||||
DMLC_DECLARE_FIELD(use_rmm)
|
||||
.set_default(false)
|
||||
.describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
|
||||
}
|
||||
};
|
||||
|
||||
using GlobalConfigThreadLocalStore = dmlc::ThreadLocalStore<GlobalConfiguration>;
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_GLOBAL_CONFIG_H_
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright (c) by XGBoost Contributors 2019-2020
|
||||
* Copyright (c) by XGBoost Contributors 2019-2021
|
||||
*/
|
||||
#ifndef XGBOOST_JSON_H_
|
||||
#define XGBOOST_JSON_H_
|
||||
@@ -301,12 +301,15 @@ class JsonBoolean : public Value {
|
||||
struct StringView {
|
||||
private:
|
||||
using CharT = char; // unsigned char
|
||||
using Traits = std::char_traits<CharT>;
|
||||
CharT const* str_;
|
||||
size_t size_;
|
||||
|
||||
public:
|
||||
StringView() = default;
|
||||
StringView(CharT const* str, size_t size) : str_{str}, size_{size} {}
|
||||
explicit StringView(std::string const& str): str_{str.c_str()}, size_{str.size()} {}
|
||||
explicit StringView(CharT const* str) : str_{str}, size_{Traits::length(str)} {}
|
||||
|
||||
CharT const& operator[](size_t p) const { return str_[p]; }
|
||||
CharT const& at(size_t p) const { // NOLINT
|
||||
@@ -322,9 +325,16 @@ struct StringView {
|
||||
CHECK_LE(beg, size_);
|
||||
return std::string {str_ + beg, n < (size_ - beg) ? n : (size_ - beg)};
|
||||
}
|
||||
char const* c_str() const { return str_; } // NOLINT
|
||||
CharT const* c_str() const { return str_; } // NOLINT
|
||||
|
||||
CharT const* cbegin() const { return str_; } // NOLINT
|
||||
CharT const* cend() const { return str_ + size(); } // NOLINT
|
||||
CharT const* begin() const { return str_; } // NOLINT
|
||||
CharT const* end() const { return str_ + size(); } // NOLINT
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, StringView const v);
|
||||
|
||||
/*!
|
||||
* \brief Data structure representing JSON format.
|
||||
*
|
||||
@@ -557,7 +567,6 @@ using String = JsonString;
|
||||
using Null = JsonNull;
|
||||
|
||||
// Utils tailored for XGBoost.
|
||||
|
||||
template <typename Parameter>
|
||||
Object ToJson(Parameter const& param) {
|
||||
Object obj;
|
||||
@@ -568,13 +577,13 @@ Object ToJson(Parameter const& param) {
|
||||
}
|
||||
|
||||
template <typename Parameter>
|
||||
void FromJson(Json const& obj, Parameter* param) {
|
||||
Args FromJson(Json const& obj, Parameter* param) {
|
||||
auto const& j_param = get<Object const>(obj);
|
||||
std::map<std::string, std::string> m;
|
||||
for (auto const& kv : j_param) {
|
||||
m[kv.first] = get<String const>(kv.second);
|
||||
}
|
||||
param->UpdateAllowUnknown(m);
|
||||
return param->UpdateAllowUnknown(m);
|
||||
}
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_JSON_H_
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2015-2020 by Contributors
|
||||
* Copyright 2015-2021 by Contributors
|
||||
* \file learner.h
|
||||
* \brief Learner interface that integrates objective, gbm and evaluation together.
|
||||
* This is the user facing XGBoost training module.
|
||||
@@ -30,6 +30,16 @@ class ObjFunction;
|
||||
class DMatrix;
|
||||
class Json;
|
||||
|
||||
enum class PredictionType : std::uint8_t { // NOLINT
|
||||
kValue = 0,
|
||||
kMargin = 1,
|
||||
kContribution = 2,
|
||||
kApproxContribution = 3,
|
||||
kInteraction = 4,
|
||||
kApproxInteraction = 5,
|
||||
kLeaf = 6
|
||||
};
|
||||
|
||||
/*! \brief entry to to easily hold returning information */
|
||||
struct XGBAPIThreadLocalEntry {
|
||||
/*! \brief result holder for returning string */
|
||||
@@ -42,10 +52,12 @@ struct XGBAPIThreadLocalEntry {
|
||||
std::vector<bst_float> ret_vec_float;
|
||||
/*! \brief temp variable of gradient pairs. */
|
||||
std::vector<GradientPair> tmp_gpair;
|
||||
/*! \brief Temp variable for returing prediction result. */
|
||||
PredictionCacheEntry prediction_entry;
|
||||
/*! \brief Temp variable for returing prediction shape. */
|
||||
std::vector<bst_ulong> prediction_shape;
|
||||
};
|
||||
|
||||
|
||||
/*!
|
||||
* \brief Learner class that does training and prediction.
|
||||
* This is the user facing module of xgboost training.
|
||||
@@ -102,8 +114,8 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
|
||||
* \param data input data
|
||||
* \param output_margin whether to only predict margin value instead of transformed prediction
|
||||
* \param out_preds output vector that stores the prediction
|
||||
* \param ntree_limit limit number of trees used for boosted tree
|
||||
* predictor, when it equals 0, this means we are using all the trees
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
* \param training Whether the prediction result is used for training
|
||||
* \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
|
||||
* \param pred_contribs whether to only predict the feature contributions
|
||||
@@ -113,7 +125,8 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
|
||||
virtual void Predict(std::shared_ptr<DMatrix> data,
|
||||
bool output_margin,
|
||||
HostDeviceVector<bst_float> *out_preds,
|
||||
unsigned ntree_limit = 0,
|
||||
unsigned layer_begin,
|
||||
unsigned layer_end,
|
||||
bool training = false,
|
||||
bool pred_leaf = false,
|
||||
bool pred_contribs = false,
|
||||
@@ -124,17 +137,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
|
||||
* \brief Inplace prediction.
|
||||
*
|
||||
* \param x A type erased data adapter.
|
||||
* \param p_m An optional Proxy DMatrix object storing meta info like
|
||||
* base margin. Can be nullptr.
|
||||
* \param type Prediction type.
|
||||
* \param missing Missing value in the data.
|
||||
* \param [in,out] out_preds Pointer to output prediction vector.
|
||||
* \param layer_begin (Optional) Begining of boosted tree layer used for prediction.
|
||||
* \param layer_end (Optional) End of booster layer. 0 means do not limit trees.
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
*/
|
||||
virtual void InplacePredict(dmlc::any const& x, std::string const& type,
|
||||
virtual void InplacePredict(dmlc::any const &x,
|
||||
std::shared_ptr<DMatrix> p_m,
|
||||
PredictionType type,
|
||||
float missing,
|
||||
HostDeviceVector<bst_float> **out_preds,
|
||||
uint32_t layer_begin, uint32_t layer_end) = 0;
|
||||
|
||||
/*
|
||||
* \brief Get number of boosted rounds from gradient booster.
|
||||
*/
|
||||
virtual int32_t BoostedRounds() const = 0;
|
||||
virtual uint32_t Groups() const = 0;
|
||||
|
||||
void LoadModel(Json const& in) override = 0;
|
||||
void SaveModel(Json* out) const override = 0;
|
||||
|
||||
@@ -161,7 +184,7 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
|
||||
* \brief Get the number of features of the booster.
|
||||
* \return number of features
|
||||
*/
|
||||
virtual uint32_t GetNumFeature() = 0;
|
||||
virtual uint32_t GetNumFeature() const = 0;
|
||||
|
||||
/*!
|
||||
* \brief Set additional attribute to the Booster.
|
||||
@@ -191,6 +214,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
|
||||
* \return vector of attribute name strings.
|
||||
*/
|
||||
virtual std::vector<std::string> GetAttrNames() const = 0;
|
||||
/*!
|
||||
* \brief Set the feature names for current booster.
|
||||
* \param fn Input feature names
|
||||
*/
|
||||
virtual void SetFeatureNames(std::vector<std::string> const& fn) = 0;
|
||||
/*!
|
||||
* \brief Get the feature names for current booster.
|
||||
* \param fn Output feature names
|
||||
*/
|
||||
virtual void GetFeatureNames(std::vector<std::string>* fn) const = 0;
|
||||
/*!
|
||||
* \brief Set the feature types for current booster.
|
||||
* \param ft Input feature types.
|
||||
*/
|
||||
virtual void SetFeatureTypes(std::vector<std::string> const& ft) = 0;
|
||||
/*!
|
||||
* \brief Get the feature types for current booster.
|
||||
* \param fn Output feature types
|
||||
*/
|
||||
virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;
|
||||
|
||||
/*!
|
||||
* \return whether the model allow lazy checkpoint in rabit.
|
||||
*/
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/parameter.h>
|
||||
#include <xgboost/global_config.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
@@ -35,19 +36,6 @@ class BaseLogger {
|
||||
std::ostringstream log_stream_;
|
||||
};
|
||||
|
||||
// Parsing both silent and debug_verbose is to provide backward compatibility.
|
||||
struct ConsoleLoggerParam : public XGBoostParameter<ConsoleLoggerParam> {
|
||||
int verbosity;
|
||||
|
||||
DMLC_DECLARE_PARAMETER(ConsoleLoggerParam) {
|
||||
DMLC_DECLARE_FIELD(verbosity)
|
||||
.set_range(0, 3)
|
||||
.set_default(1) // shows only warning
|
||||
.describe("Flag to print out detailed breakdown of runtime.");
|
||||
DMLC_DECLARE_ALIAS(verbosity, debug_verbose);
|
||||
}
|
||||
};
|
||||
|
||||
class ConsoleLogger : public BaseLogger {
|
||||
public:
|
||||
enum class LogVerbosity {
|
||||
@@ -60,9 +48,6 @@ class ConsoleLogger : public BaseLogger {
|
||||
using LV = LogVerbosity;
|
||||
|
||||
private:
|
||||
static LogVerbosity global_verbosity_;
|
||||
static ConsoleLoggerParam param_;
|
||||
|
||||
LogVerbosity cur_verbosity_;
|
||||
|
||||
public:
|
||||
|
||||
@@ -87,14 +87,11 @@ struct XGBoostParameter : public dmlc::Parameter<Type> {
|
||||
|
||||
public:
|
||||
template <typename Container>
|
||||
Args UpdateAllowUnknown(Container const& kwargs, bool* out_changed = nullptr) {
|
||||
Args UpdateAllowUnknown(Container const& kwargs) {
|
||||
if (initialised_) {
|
||||
return dmlc::Parameter<Type>::UpdateAllowUnknown(kwargs, out_changed);
|
||||
return dmlc::Parameter<Type>::UpdateAllowUnknown(kwargs);
|
||||
} else {
|
||||
auto unknown = dmlc::Parameter<Type>::InitAllowUnknown(kwargs);
|
||||
if (out_changed) {
|
||||
*out_changed = true;
|
||||
}
|
||||
initialised_ = true;
|
||||
return unknown;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2017-2020 by Contributors
|
||||
* Copyright 2017-2021 by Contributors
|
||||
* \file predictor.h
|
||||
* \brief Interface of predictor,
|
||||
* performs predictions for a gradient booster.
|
||||
@@ -119,6 +119,17 @@ class Predictor {
|
||||
*/
|
||||
virtual void Configure(const std::vector<std::pair<std::string, std::string>>&);
|
||||
|
||||
/**
|
||||
* \brief Initialize output prediction
|
||||
*
|
||||
* \param info Meta info for the DMatrix object used for prediction.
|
||||
* \param out_predt Prediction vector to be initialized.
|
||||
* \param model Tree model used for prediction.
|
||||
*/
|
||||
virtual void InitOutPredictions(const MetaInfo &info,
|
||||
HostDeviceVector<bst_float> *out_predt,
|
||||
const gbm::GBTreeModel &model) const = 0;
|
||||
|
||||
/**
|
||||
* \brief Generate batch predictions for a given feature matrix. May use
|
||||
* cached predictions if available instead of calculating from scratch.
|
||||
@@ -127,12 +138,11 @@ class Predictor {
|
||||
* \param [in,out] out_preds The output preds.
|
||||
* \param model The model to predict from.
|
||||
* \param tree_begin The tree begin index.
|
||||
* \param ntree_limit (Optional) The ntree limit. 0 means do not
|
||||
* limit trees.
|
||||
* \param tree_end The tree end index.
|
||||
*/
|
||||
virtual void PredictBatch(DMatrix* dmat, PredictionCacheEntry* out_preds,
|
||||
const gbm::GBTreeModel& model, int tree_begin,
|
||||
uint32_t const ntree_limit = 0) = 0;
|
||||
const gbm::GBTreeModel& model, uint32_t tree_begin,
|
||||
uint32_t tree_end = 0) const = 0;
|
||||
|
||||
/**
|
||||
* \brief Inplace prediction.
|
||||
@@ -140,12 +150,16 @@ class Predictor {
|
||||
* \param model The model to predict from.
|
||||
* \param missing Missing value in the data.
|
||||
* \param [in,out] out_preds The output preds.
|
||||
* \param tree_begin (Optional) Begining of boosted trees used for prediction.
|
||||
* \param tree_begin (Optional) Beginning of boosted trees used for prediction.
|
||||
* \param tree_end (Optional) End of booster trees. 0 means do not limit trees.
|
||||
*
|
||||
* \return True if the data can be handled by current predictor, false otherwise.
|
||||
*/
|
||||
virtual void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
|
||||
float missing, PredictionCacheEntry *out_preds,
|
||||
uint32_t tree_begin = 0, uint32_t tree_end = 0) const = 0;
|
||||
virtual bool InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
|
||||
const gbm::GBTreeModel &model, float missing,
|
||||
PredictionCacheEntry *out_preds,
|
||||
uint32_t tree_begin = 0,
|
||||
uint32_t tree_end = 0) const = 0;
|
||||
/**
|
||||
* \brief online prediction function, predict score for one instance at a time
|
||||
* NOTE: use the batch prediction interface if possible, batch prediction is
|
||||
@@ -155,13 +169,13 @@ class Predictor {
|
||||
* \param inst The instance to predict.
|
||||
* \param [in,out] out_preds The output preds.
|
||||
* \param model The model to predict from
|
||||
* \param ntree_limit (Optional) The ntree limit.
|
||||
* \param tree_end (Optional) The tree end index.
|
||||
*/
|
||||
|
||||
virtual void PredictInstance(const SparsePage::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit = 0) = 0;
|
||||
unsigned tree_end = 0) const = 0;
|
||||
|
||||
/**
|
||||
* \brief predict the leaf index of each tree, the output will be nsample *
|
||||
@@ -170,18 +184,14 @@ class Predictor {
|
||||
* \param [in,out] dmat The input feature matrix.
|
||||
* \param [in,out] out_preds The output preds.
|
||||
* \param model Model to make predictions from.
|
||||
* \param ntree_limit (Optional) The ntree limit.
|
||||
* \param tree_end (Optional) The tree end index.
|
||||
*/
|
||||
|
||||
virtual void PredictLeaf(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit = 0) = 0;
|
||||
unsigned tree_end = 0) const = 0;
|
||||
|
||||
/**
|
||||
* \fn virtual void Predictor::PredictContribution( DMatrix* dmat,
|
||||
* std::vector<bst_float>* out_contribs, const gbm::GBTreeModel& model,
|
||||
* unsigned ntree_limit = 0) = 0;
|
||||
*
|
||||
* \brief feature contributions to individual predictions; the output will be
|
||||
* a vector of length (nfeats + 1) * num_output_group * nsample, arranged in
|
||||
* that order.
|
||||
@@ -189,7 +199,7 @@ class Predictor {
|
||||
* \param [in,out] dmat The input feature matrix.
|
||||
* \param [in,out] out_contribs The output feature contribs.
|
||||
* \param model Model to make predictions from.
|
||||
* \param ntree_limit (Optional) The ntree limit.
|
||||
* \param tree_end The tree end index.
|
||||
* \param tree_weights (Optional) Weights to multiply each tree by.
|
||||
* \param approximate Use fast approximate algorithm.
|
||||
* \param condition Condition on the condition_feature (0=no, -1=cond off, 1=cond on).
|
||||
@@ -199,18 +209,18 @@ class Predictor {
|
||||
virtual void PredictContribution(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit = 0,
|
||||
unsigned tree_end = 0,
|
||||
std::vector<bst_float>* tree_weights = nullptr,
|
||||
bool approximate = false,
|
||||
int condition = 0,
|
||||
unsigned condition_feature = 0) = 0;
|
||||
unsigned condition_feature = 0) const = 0;
|
||||
|
||||
virtual void PredictInteractionContributions(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit = 0,
|
||||
unsigned tree_end = 0,
|
||||
std::vector<bst_float>* tree_weights = nullptr,
|
||||
bool approximate = false) = 0;
|
||||
bool approximate = false) const = 0;
|
||||
|
||||
|
||||
/**
|
||||
|
||||
@@ -38,6 +38,10 @@
|
||||
#include <type_traits>
|
||||
#include <cstdio>
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#include <cuda_runtime.h>
|
||||
#endif // defined(__CUDACC__)
|
||||
|
||||
/*!
|
||||
* The version number 1910 is picked up from GSL.
|
||||
*
|
||||
@@ -71,45 +75,46 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
// Usual logging facility is not available inside device code.
|
||||
// assert is not supported in mac as of CUDA 10.0
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
// Windows CUDA doesn't have __assert_fail.
|
||||
#define KERNEL_CHECK(cond) \
|
||||
do { \
|
||||
if (!(cond)) { \
|
||||
printf("\nKernel error:\n" \
|
||||
"In: %s: %d\n" \
|
||||
"\t%s\n\tExpecting: %s\n" \
|
||||
"\tBlock: [%d, %d, %d], Thread: [%d, %d, %d]\n\n", \
|
||||
__FILE__, __LINE__, __PRETTY_FUNCTION__, #cond, blockIdx.x, \
|
||||
blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); \
|
||||
if (XGBOOST_EXPECT(!(cond), false)) { \
|
||||
asm("trap;"); \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#else // defined(_MSC_VER)
|
||||
|
||||
#define __ASSERT_STR_HELPER(x) #x
|
||||
|
||||
#define KERNEL_CHECK(cond) \
|
||||
(XGBOOST_EXPECT((cond), true) \
|
||||
? static_cast<void>(0) \
|
||||
: __assert_fail(__ASSERT_STR_HELPER((cond)), __FILE__, __LINE__, \
|
||||
__PRETTY_FUNCTION__))
|
||||
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#define SPAN_CHECK KERNEL_CHECK
|
||||
#elif defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 // R package
|
||||
#define SPAN_CHECK CHECK // check from dmlc
|
||||
#else // not CUDA, not R
|
||||
#define SPAN_CHECK(cond) \
|
||||
do { \
|
||||
if (XGBOOST_EXPECT(!(cond), false)) { \
|
||||
fprintf(stderr, "[xgboost] Condition %s failed.\n", #cond); \
|
||||
fflush(stderr); /* It seems stderr on Windows is beffered? */ \
|
||||
std::terminate(); \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#else // not CUDA
|
||||
|
||||
#define KERNEL_CHECK(cond) \
|
||||
(XGBOOST_EXPECT((cond), true) ? static_cast<void>(0) : std::terminate())
|
||||
|
||||
#define SPAN_CHECK(cond) KERNEL_CHECK(cond)
|
||||
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#define SPAN_LT(lhs, rhs) \
|
||||
if (!((lhs) < (rhs))) { \
|
||||
printf("[xgboost] Condition: %lu < %lu failed\n", \
|
||||
static_cast<size_t>(lhs), static_cast<size_t>(rhs)); \
|
||||
asm("trap;"); \
|
||||
}
|
||||
#define SPAN_LT(lhs, rhs) KERNEL_CHECK((lhs) < (rhs))
|
||||
#else
|
||||
#define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))
|
||||
#define SPAN_LT(lhs, rhs) KERNEL_CHECK((lhs) < (rhs))
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
|
||||
namespace detail {
|
||||
|
||||
@@ -70,11 +70,14 @@ class TreeUpdater : public Configurable {
|
||||
* the prediction cache. If true, the prediction cache will have been
|
||||
* updated by the time this function returns.
|
||||
*/
|
||||
virtual bool UpdatePredictionCache(const DMatrix* data,
|
||||
HostDeviceVector<bst_float>* out_preds) {
|
||||
// Remove unused parameter compiler warning.
|
||||
(void) data;
|
||||
(void) out_preds;
|
||||
virtual bool UpdatePredictionCache(const DMatrix* /*data*/,
|
||||
HostDeviceVector<bst_float>* /*out_preds*/) {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool UpdatePredictionCacheMulticlass(const DMatrix* /*data*/,
|
||||
HostDeviceVector<bst_float>* /*out_preds*/,
|
||||
const int /*gid*/, const int /*ngroup*/) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#define XGBOOST_VERSION_CONFIG_H_
|
||||
|
||||
#define XGBOOST_VER_MAJOR 1
|
||||
#define XGBOOST_VER_MINOR 3
|
||||
#define XGBOOST_VER_PATCH 0
|
||||
#define XGBOOST_VER_MINOR 4
|
||||
#define XGBOOST_VER_PATCH 2
|
||||
|
||||
#endif // XGBOOST_VERSION_CONFIG_H_
|
||||
|
||||
@@ -3,6 +3,7 @@ import errno
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -83,8 +84,9 @@ if __name__ == "__main__":
|
||||
|
||||
print("building Java wrapper")
|
||||
with cd(".."):
|
||||
maybe_makedirs("build")
|
||||
with cd("build"):
|
||||
build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build'
|
||||
maybe_makedirs(build_dir)
|
||||
with cd(build_dir):
|
||||
if sys.platform == "win32":
|
||||
# Force x64 build on Windows.
|
||||
maybe_generator = ' -A x64'
|
||||
@@ -113,6 +115,9 @@ if __name__ == "__main__":
|
||||
if gpu_arch_flag is not None:
|
||||
args.append("%s" % gpu_arch_flag)
|
||||
|
||||
lib_dir = os.path.join(os.pardir, 'lib')
|
||||
if os.path.exists(lib_dir):
|
||||
shutil.rmtree(lib_dir)
|
||||
run("cmake .. " + " ".join(args) + maybe_generator)
|
||||
run("cmake --build . --config Release" + maybe_parallel_build)
|
||||
|
||||
@@ -124,13 +129,23 @@ if __name__ == "__main__":
|
||||
xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark'
|
||||
|
||||
print("copying native library")
|
||||
library_name = {
|
||||
"win32": "xgboost4j.dll",
|
||||
"darwin": "libxgboost4j.dylib",
|
||||
"linux": "libxgboost4j.so"
|
||||
}[sys.platform]
|
||||
maybe_makedirs("{}/src/main/resources/lib".format(xgboost4j))
|
||||
cp("../lib/" + library_name, "{}/src/main/resources/lib".format(xgboost4j))
|
||||
library_name, os_folder = {
|
||||
"Windows": ("xgboost4j.dll", "windows"),
|
||||
"Darwin": ("libxgboost4j.dylib", "macos"),
|
||||
"Linux": ("libxgboost4j.so", "linux"),
|
||||
"SunOS": ("libxgboost4j.so", "solaris"),
|
||||
}[platform.system()]
|
||||
arch_folder = {
|
||||
"x86_64": "x86_64", # on Linux & macOS x86_64
|
||||
"amd64": "x86_64", # on Windows x86_64
|
||||
"i86pc": "x86_64", # on Solaris x86_64
|
||||
"sun4v": "sparc", # on Solaris sparc
|
||||
"arm64": "aarch64", # on macOS & Windows ARM 64-bit
|
||||
"aarch64": "aarch64"
|
||||
}[platform.machine().lower()]
|
||||
output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder)
|
||||
maybe_makedirs(output_folder)
|
||||
cp("../lib/" + library_name, output_folder)
|
||||
|
||||
print("copying pure-Python tracker")
|
||||
cp("../dmlc-core/tracker/dmlc_tracker/tracker.py",
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>XGBoost JVM Package</name>
|
||||
<description>JVM Package for XGBoost</description>
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-example_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
<packaging>jar</packaging>
|
||||
<build>
|
||||
<plugins>
|
||||
@@ -26,7 +26,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
@@ -37,7 +37,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-flink_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
@@ -26,7 +26,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-gpu_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../xgboost4j/src/
|
||||
1
jvm-packages/xgboost4j-gpu/src/main/java
Symbolic link
1
jvm-packages/xgboost4j-gpu/src/main/java
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../xgboost4j/src/main/java/
|
||||
@@ -0,0 +1 @@
|
||||
../../../../xgboost4j/src/main/resources/xgboost4j-version.properties
|
||||
1
jvm-packages/xgboost4j-gpu/src/main/scala
Symbolic link
1
jvm-packages/xgboost4j-gpu/src/main/scala
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../xgboost4j/src/main/scala/
|
||||
1
jvm-packages/xgboost4j-gpu/src/native
Symbolic link
1
jvm-packages/xgboost4j-gpu/src/native
Symbolic link
@@ -0,0 +1 @@
|
||||
../../xgboost4j/src/native
|
||||
1
jvm-packages/xgboost4j-gpu/src/test
Symbolic link
1
jvm-packages/xgboost4j-gpu/src/test
Symbolic link
@@ -0,0 +1 @@
|
||||
../../xgboost4j/src/test
|
||||
@@ -6,7 +6,7 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
|
||||
<build>
|
||||
@@ -24,7 +24,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../xgboost4j-spark/src/
|
||||
1
jvm-packages/xgboost4j-spark-gpu/src/main/scala
Symbolic link
1
jvm-packages/xgboost4j-spark-gpu/src/main/scala
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../xgboost4j-spark/src/main/scala
|
||||
1
jvm-packages/xgboost4j-spark-gpu/src/test
Symbolic link
1
jvm-packages/xgboost4j-spark-gpu/src/test
Symbolic link
@@ -0,0 +1 @@
|
||||
../../xgboost4j-spark/src/test
|
||||
@@ -6,7 +6,7 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-spark_2.12</artifactId>
|
||||
<build>
|
||||
@@ -24,7 +24,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
|
||||
@@ -149,7 +149,7 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
|
||||
overridedParams += "num_early_stopping_rounds" -> numEarlyStoppingRounds
|
||||
if (numEarlyStoppingRounds > 0 &&
|
||||
!overridedParams.contains("maximize_evaluation_metrics")) {
|
||||
if (overridedParams.contains("custom_eval")) {
|
||||
if (overridedParams.getOrElse("custom_eval", null) != null) {
|
||||
throw new IllegalArgumentException("custom_eval does not support early stopping")
|
||||
}
|
||||
val eval_metric = overridedParams("eval_metric").toString
|
||||
@@ -613,8 +613,12 @@ object XGBoost extends Serializable {
|
||||
}
|
||||
}
|
||||
sparkJobThread.setUncaughtExceptionHandler(tracker)
|
||||
sparkJobThread.start()
|
||||
val trackerReturnVal = parallelismTracker.execute(tracker.waitFor(0L))
|
||||
|
||||
val trackerReturnVal = parallelismTracker.execute {
|
||||
sparkJobThread.start()
|
||||
tracker.waitFor(0L)
|
||||
}
|
||||
|
||||
logger.info(s"Rabit returns with exit code $trackerReturnVal")
|
||||
val (booster, metrics) = postTrackerReturnProcessing(trackerReturnVal,
|
||||
boostersAndMetrics, sparkJobThread)
|
||||
|
||||
@@ -78,4 +78,26 @@ class ParameterSuite extends FunSuite with PerTest with BeforeAndAfterAll {
|
||||
waitForSparkContextShutdown()
|
||||
}
|
||||
}
|
||||
|
||||
test("custom_eval does not support early stopping") {
|
||||
val paramMap = Map("eta" -> "0.1", "custom_eval" -> new EvalError, "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5,
|
||||
"num_workers" -> numWorkers, "num_early_stopping_rounds" -> 2)
|
||||
val trainingDF = buildDataFrame(MultiClassification.train)
|
||||
|
||||
val thrown = intercept[IllegalArgumentException] {
|
||||
new XGBoostClassifier(paramMap).fit(trainingDF)
|
||||
}
|
||||
|
||||
assert(thrown.getMessage.contains("custom_eval does not support early stopping"))
|
||||
}
|
||||
|
||||
test("early stopping should work without custom_eval setting") {
|
||||
val paramMap = Map("eta" -> "0.1", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5,
|
||||
"num_workers" -> numWorkers, "num_early_stopping_rounds" -> 2)
|
||||
val trainingDF = buildDataFrame(MultiClassification.train)
|
||||
|
||||
new XGBoostClassifier(paramMap).fit(trainingDF)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j_2.12</artifactId>
|
||||
<version>1.3.0-SNAPSHOT</version>
|
||||
<version>1.4.2</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
|
||||
@@ -62,31 +62,23 @@ public class Booster implements Serializable, KryoSerializable {
|
||||
if (modelPath == null) {
|
||||
throw new NullPointerException("modelPath : null");
|
||||
}
|
||||
Booster ret = new Booster(new HashMap<String, Object>(), new DMatrix[0]);
|
||||
Booster ret = new Booster(new HashMap<>(), new DMatrix[0]);
|
||||
XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadModel(ret.handle, modelPath));
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a new Booster model from a file opened as input stream.
|
||||
* The assumption is the input stream only contains one XGBoost Model.
|
||||
* Load a new Booster model from a byte array buffer.
|
||||
* The assumption is the array only contains one XGBoost Model.
|
||||
* This can be used to load existing booster models saved by other xgboost bindings.
|
||||
*
|
||||
* @param in The input stream of the file.
|
||||
* @return The create boosted
|
||||
* @param buffer The byte contents of the booster.
|
||||
* @return The created boosted
|
||||
* @throws XGBoostError
|
||||
* @throws IOException
|
||||
*/
|
||||
static Booster loadModel(InputStream in) throws XGBoostError, IOException {
|
||||
int size;
|
||||
byte[] buf = new byte[1<<20];
|
||||
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
||||
while ((size = in.read(buf)) != -1) {
|
||||
os.write(buf, 0, size);
|
||||
}
|
||||
in.close();
|
||||
Booster ret = new Booster(new HashMap<String, Object>(), new DMatrix[0]);
|
||||
XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(ret.handle,os.toByteArray()));
|
||||
static Booster loadModel(byte[] buffer) throws XGBoostError {
|
||||
Booster ret = new Booster(new HashMap<>(), new DMatrix[0]);
|
||||
XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(ret.handle, buffer));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
Copyright (c) 2014, 2021 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -15,8 +15,13 @@
|
||||
*/
|
||||
package ml.dmlc.xgboost4j.java;
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.reflect.Field;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
@@ -30,17 +35,19 @@ class NativeLibLoader {
|
||||
private static final Log logger = LogFactory.getLog(NativeLibLoader.class);
|
||||
|
||||
private static boolean initialized = false;
|
||||
private static final String nativeResourcePath = "/lib/";
|
||||
private static final String nativeResourcePath = "/lib";
|
||||
private static final String[] libNames = new String[]{"xgboost4j"};
|
||||
|
||||
static synchronized void initXGBoost() throws IOException {
|
||||
if (!initialized) {
|
||||
String platform = computePlatformArchitecture();
|
||||
for (String libName : libNames) {
|
||||
try {
|
||||
String libraryFromJar = nativeResourcePath + System.mapLibraryName(libName);
|
||||
loadLibraryFromJar(libraryFromJar);
|
||||
String libraryPathInJar = nativeResourcePath + "/" +
|
||||
platform + "/" + System.mapLibraryName(libName);
|
||||
loadLibraryFromJar(libraryPathInJar);
|
||||
} catch (IOException ioe) {
|
||||
logger.error("failed to load " + libName + " library from jar");
|
||||
logger.error("failed to load " + libName + " library from jar for platform " + platform);
|
||||
throw ioe;
|
||||
}
|
||||
}
|
||||
@@ -48,6 +55,44 @@ class NativeLibLoader {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a String representing the path to look for.
|
||||
* Assumes the libraries are stored in the jar in os/architecture folders.
|
||||
* <p>
|
||||
* Throws IllegalStateException if the architecture or OS is unsupported.
|
||||
* Supported OS: macOS, Windows, Linux, Solaris.
|
||||
* Supported Architectures: x86_64, aarch64, sparc.
|
||||
* @return The platform & architecture path.
|
||||
*/
|
||||
private static String computePlatformArchitecture() {
|
||||
String detectedOS;
|
||||
String os = System.getProperty("os.name", "generic").toLowerCase(Locale.ENGLISH);
|
||||
if (os.contains("mac") || os.contains("darwin")) {
|
||||
detectedOS = "macos";
|
||||
} else if (os.contains("win")) {
|
||||
detectedOS = "windows";
|
||||
} else if (os.contains("nux")) {
|
||||
detectedOS = "linux";
|
||||
} else if (os.contains("sunos")) {
|
||||
detectedOS = "solaris";
|
||||
} else {
|
||||
throw new IllegalStateException("Unsupported os:" + os);
|
||||
}
|
||||
String detectedArch;
|
||||
String arch = System.getProperty("os.arch", "generic").toLowerCase(Locale.ENGLISH);
|
||||
if (arch.startsWith("amd64") || arch.startsWith("x86_64")) {
|
||||
detectedArch = "x86_64";
|
||||
} else if (arch.startsWith("aarch64") || arch.startsWith("arm64")) {
|
||||
detectedArch = "aarch64";
|
||||
} else if (arch.startsWith("sparc")) {
|
||||
detectedArch = "sparc";
|
||||
} else {
|
||||
throw new IllegalStateException("Unsupported architecture:" + arch);
|
||||
}
|
||||
|
||||
return detectedOS + "/" + detectedArch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads library from current JAR archive
|
||||
* <p/>
|
||||
@@ -65,9 +110,8 @@ class NativeLibLoader {
|
||||
* @throws IllegalArgumentException If the path is not absolute or if the filename is shorter than
|
||||
* three characters
|
||||
*/
|
||||
private static void loadLibraryFromJar(String path) throws IOException, IllegalArgumentException{
|
||||
private static void loadLibraryFromJar(String path) throws IOException, IllegalArgumentException {
|
||||
String temp = createTempFileFromResource(path);
|
||||
// Finally, load the library
|
||||
System.load(temp);
|
||||
}
|
||||
|
||||
@@ -82,8 +126,8 @@ class NativeLibLoader {
|
||||
* {@code path}.
|
||||
* @param path Path to the resources in the jar
|
||||
* @return The created temp file.
|
||||
* @throws IOException
|
||||
* @throws IllegalArgumentException
|
||||
* @throws IOException If it failed to read the file.
|
||||
* @throws IllegalArgumentException If the filename is invalid.
|
||||
*/
|
||||
static String createTempFileFromResource(String path) throws
|
||||
IOException, IllegalArgumentException {
|
||||
@@ -95,7 +139,7 @@ class NativeLibLoader {
|
||||
String[] parts = path.split("/");
|
||||
String filename = (parts.length > 1) ? parts[parts.length - 1] : null;
|
||||
|
||||
// Split filename to prexif and suffix (extension)
|
||||
// Split filename to prefix and suffix (extension)
|
||||
String prefix = "";
|
||||
String suffix = null;
|
||||
if (filename != null) {
|
||||
@@ -121,22 +165,18 @@ class NativeLibLoader {
|
||||
int readBytes;
|
||||
|
||||
// Open and check input stream
|
||||
InputStream is = NativeLibLoader.class.getResourceAsStream(path);
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException("File " + path + " was not found inside JAR.");
|
||||
}
|
||||
try (InputStream is = NativeLibLoader.class.getResourceAsStream(path);
|
||||
OutputStream os = new FileOutputStream(temp)) {
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException("File " + path + " was not found inside JAR.");
|
||||
}
|
||||
|
||||
// Open output stream and copy data between source file in JAR and the temporary file
|
||||
OutputStream os = new FileOutputStream(temp);
|
||||
try {
|
||||
// Open output stream and copy data between source file in JAR and the temporary file
|
||||
while ((readBytes = is.read(buffer)) != -1) {
|
||||
os.write(buffer, 0, readBytes);
|
||||
}
|
||||
} finally {
|
||||
// If read/write fails, close streams safely before throwing an exception
|
||||
os.close();
|
||||
is.close();
|
||||
}
|
||||
|
||||
return temp.getAbsolutePath();
|
||||
}
|
||||
|
||||
|
||||
@@ -15,10 +15,7 @@
|
||||
*/
|
||||
package ml.dmlc.xgboost4j.java;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
@@ -56,9 +53,28 @@ public class XGBoost {
|
||||
* @throws XGBoostError
|
||||
* @throws IOException
|
||||
*/
|
||||
public static Booster loadModel(InputStream in)
|
||||
throws XGBoostError, IOException {
|
||||
return Booster.loadModel(in);
|
||||
public static Booster loadModel(InputStream in) throws XGBoostError, IOException {
|
||||
int size;
|
||||
byte[] buf = new byte[1<<20];
|
||||
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
||||
while ((size = in.read(buf)) != -1) {
|
||||
os.write(buf, 0, size);
|
||||
}
|
||||
in.close();
|
||||
return Booster.loadModel(buf);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a new Booster model from a byte array buffer.
|
||||
* The assumption is the array only contains one XGBoost Model.
|
||||
* This can be used to load existing booster models saved by other xgboost bindings.
|
||||
*
|
||||
* @param buffer The byte contents of the booster.
|
||||
* @return The create boosted
|
||||
* @throws XGBoostError
|
||||
*/
|
||||
public static Booster loadModel(byte[] buffer) throws XGBoostError, IOException {
|
||||
return Booster.loadModel(buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -250,14 +250,18 @@ class SparsePageLZ4Format : public SparsePageFormat<SparsePage> {
|
||||
int nindex = index_.num_chunk();
|
||||
int nvalue = value_.num_chunk();
|
||||
int ntotal = nindex + nvalue;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_write_)
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_write_)
|
||||
for (int i = 0; i < ntotal; ++i) {
|
||||
if (i < nindex) {
|
||||
index_.Compress(i, use_lz4_hc_);
|
||||
} else {
|
||||
value_.Compress(i - nindex, use_lz4_hc_);
|
||||
}
|
||||
exc.Run([&]() {
|
||||
if (i < nindex) {
|
||||
index_.Compress(i, use_lz4_hc_);
|
||||
} else {
|
||||
value_.Compress(i - nindex, use_lz4_hc_);
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
index_.Write(fo);
|
||||
value_.Write(fo);
|
||||
// statistics
|
||||
@@ -276,14 +280,18 @@ class SparsePageLZ4Format : public SparsePageFormat<SparsePage> {
|
||||
int nindex = index_.num_chunk();
|
||||
int nvalue = value_.num_chunk();
|
||||
int ntotal = nindex + nvalue;
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
||||
for (int i = 0; i < ntotal; ++i) {
|
||||
if (i < nindex) {
|
||||
index_.Decompress(i);
|
||||
} else {
|
||||
value_.Decompress(i - nindex);
|
||||
}
|
||||
exc.Run([&]() {
|
||||
if (i < nindex) {
|
||||
index_.Decompress(i);
|
||||
} else {
|
||||
value_.Decompress(i - nindex);
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
@@ -134,7 +134,7 @@ struct LogisticRawOneAPI : public LogisticRegressionOneAPI {
|
||||
predt = SigmoidOneAPI(predt);
|
||||
return std::max(predt * (T(1.0f) - predt), eps);
|
||||
}
|
||||
static const char* DefaultEvalMetric() { return "auc"; }
|
||||
static const char* DefaultEvalMetric() { return "logloss"; }
|
||||
|
||||
static const char* Name() { return "binary:logitraw_oneapi"; }
|
||||
};
|
||||
|
||||
@@ -1,11 +1,54 @@
|
||||
include *.rst
|
||||
include README.rst
|
||||
include xgboost/LICENSE
|
||||
include xgboost/VERSION
|
||||
include xgboost/CMakeLists.txt
|
||||
|
||||
recursive-include xgboost *
|
||||
include xgboost/py.typed
|
||||
recursive-include xgboost *.py
|
||||
recursive-include xgboost/cmake *
|
||||
exclude xgboost/cmake/RPackageInstall.cmake.in
|
||||
exclude xgboost/cmake/RPackageInstallTargetSetup.cmake
|
||||
exclude xgboost/cmake/Sanitizer.cmake
|
||||
exclude xgboost/cmake/modules/FindASan.cmake
|
||||
exclude xgboost/cmake/modules/FindLSan.cmake
|
||||
exclude xgboost/cmake/modules/FindLibR.cmake
|
||||
exclude xgboost/cmake/modules/FindTSan.cmake
|
||||
exclude xgboost/cmake/modules/FindUBSan.cmake
|
||||
recursive-include xgboost/include *
|
||||
recursive-include xgboost/plugin *
|
||||
recursive-include xgboost/src *
|
||||
recursive-include xgboost/rabit *
|
||||
recursive-include xgboost/dmlc-core *
|
||||
|
||||
global-exclude *.py[oc]
|
||||
include xgboost/rabit/CMakeLists.txt
|
||||
recursive-include xgboost/rabit/include *
|
||||
recursive-include xgboost/rabit/src *
|
||||
prune xgboost/rabit/doc
|
||||
prune xgboost/rabit/guide
|
||||
|
||||
include xgboost/dmlc-core/CMakeLists.txt
|
||||
|
||||
recursive-include xgboost/dmlc-core/cmake *
|
||||
exclude xgboost/dmlc-core/cmake/gtest_cmake.in
|
||||
exclude xgboost/dmlc-core/cmake/lint.cmake
|
||||
exclude xgboost/dmlc-core/cmake/Sanitizer.cmake
|
||||
exclude xgboost/dmlc-core/cmake/Modules/FindASan.cmake
|
||||
exclude xgboost/dmlc-core/cmake/Modules/FindLSan.cmake
|
||||
exclude xgboost/dmlc-core/cmake/Modules/FindTSan.cmake
|
||||
exclude xgboost/dmlc-core/cmake/Modules/FindUBSan.cmake
|
||||
|
||||
recursive-include xgboost/dmlc-core/include *
|
||||
recursive-include xgboost/dmlc-core/include *
|
||||
recursive-include xgboost/dmlc-core/make *
|
||||
recursive-include xgboost/dmlc-core/src *
|
||||
include xgboost/dmlc-core/tracker/dmlc-submit
|
||||
recursive-include xgboost/dmlc-core/tracker/dmlc_tracker *.py
|
||||
include xgboost/dmlc-core/tracker/yarn/build.bat
|
||||
include xgboost/dmlc-core/tracker/yarn/build.sh
|
||||
include xgboost/dmlc-core/tracker/yarn/pom.xml
|
||||
recursive-include xgboost/dmlc-core/tracker/yarn/src *
|
||||
include xgboost/dmlc-core/windows/dmlc.sln
|
||||
include xgboost/dmlc-core/windows/dmlc/dmlc.vcxproj
|
||||
|
||||
prune xgboost/dmlc-core/doc
|
||||
prune xgboost/dmlc-core/scripts/
|
||||
|
||||
global-exclude *.py[oc]
|
||||
|
||||
@@ -1,2 +1,6 @@
|
||||
[metadata]
|
||||
description-file = README.rst
|
||||
|
||||
[mypy]
|
||||
ignore_missing_imports = True
|
||||
disallow_untyped_defs = True
|
||||
@@ -105,11 +105,12 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors
|
||||
for k, v in USER_OPTIONS.items():
|
||||
arg = k.replace('-', '_').upper()
|
||||
value = str(v[2])
|
||||
if arg == 'USE_SYSTEM_LIBXGBOOST':
|
||||
continue
|
||||
if arg == 'USE_OPENMP' and use_omp == 0:
|
||||
cmake_cmd.append("-D" + arg + "=0")
|
||||
continue
|
||||
cmake_cmd.append('-D' + arg + '=' + value)
|
||||
if k == 'USE-SYSTEM-LIBXGBOOST':
|
||||
continue
|
||||
if k == 'USE_OPENMP' and use_omp == 0:
|
||||
continue
|
||||
|
||||
self.logger.info('Run CMake command: %s', str(cmake_cmd))
|
||||
subprocess.check_call(cmake_cmd, cwd=build_dir)
|
||||
@@ -128,12 +129,6 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors
|
||||
self.logger.info('Using system libxgboost.')
|
||||
return
|
||||
|
||||
src_dir = 'xgboost'
|
||||
try:
|
||||
copy_tree(os.path.join(CURRENT_DIR, os.path.pardir),
|
||||
os.path.join(self.build_temp, src_dir))
|
||||
except Exception: # pylint: disable=broad-except
|
||||
copy_tree(src_dir, os.path.join(self.build_temp, src_dir))
|
||||
build_dir = self.build_temp
|
||||
global BUILD_TEMP_DIR # pylint: disable=global-statement
|
||||
BUILD_TEMP_DIR = build_dir
|
||||
@@ -144,6 +139,13 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors
|
||||
self.logger.info('Found shared library, skipping build.')
|
||||
return
|
||||
|
||||
src_dir = 'xgboost'
|
||||
try:
|
||||
copy_tree(os.path.join(CURRENT_DIR, os.path.pardir),
|
||||
os.path.join(self.build_temp, src_dir))
|
||||
except Exception: # pylint: disable=broad-except
|
||||
copy_tree(src_dir, os.path.join(self.build_temp, src_dir))
|
||||
|
||||
self.logger.info('Building from source. %s', libxgboost)
|
||||
if not os.path.exists(build_dir):
|
||||
os.mkdir(build_dir)
|
||||
@@ -305,6 +307,7 @@ if __name__ == '__main__':
|
||||
description="XGBoost Python Package",
|
||||
long_description=open(os.path.join(CURRENT_DIR, 'README.rst'),
|
||||
encoding='utf-8').read(),
|
||||
long_description_content_type="text/x-rst",
|
||||
install_requires=[
|
||||
'numpy',
|
||||
'scipy',
|
||||
|
||||
@@ -1 +1 @@
|
||||
1.3.0-SNAPSHOT
|
||||
1.4.2
|
||||
|
||||
@@ -17,6 +17,7 @@ try:
|
||||
from .sklearn import XGBModel, XGBClassifier, XGBRegressor, XGBRanker
|
||||
from .sklearn import XGBRFClassifier, XGBRFRegressor
|
||||
from .plotting import plot_importance, plot_tree, to_graphviz
|
||||
from .config import set_config, get_config, config_context
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
@@ -29,4 +30,5 @@ __all__ = ['DMatrix', 'DeviceQuantileDMatrix', 'Booster',
|
||||
'RabitTracker',
|
||||
'XGBModel', 'XGBClassifier', 'XGBRegressor', 'XGBRanker',
|
||||
'XGBRFClassifier', 'XGBRFRegressor',
|
||||
'plot_importance', 'plot_tree', 'to_graphviz', 'dask']
|
||||
'plot_importance', 'plot_tree', 'to_graphviz', 'dask',
|
||||
'set_config', 'get_config', 'config_context']
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user