sync Jun 5

commit af8845405a by amdsc21, 2023-06-07 02:43:21 +02:00
56 changed files with 531 additions and 2106 deletions

View File

@@ -9,85 +9,23 @@ updates:
     directory: "/jvm-packages"
     schedule:
       interval: "daily"
-    ignore:
-      # Pin Scala version to 2.12.x
-      - dependency-name: "org.scala-lang:scala-compiler"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-reflect"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-library"
-        versions: [">= 2.13.0"]
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j"
     schedule:
       interval: "daily"
-    ignore:
-      # Pin Scala version to 2.12.x
-      - dependency-name: "org.scala-lang:scala-compiler"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-reflect"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-library"
-        versions: [">= 2.13.0"]
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-gpu"
     schedule:
       interval: "daily"
-    ignore:
-      # Pin Scala version to 2.12.x
-      - dependency-name: "org.scala-lang:scala-compiler"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-reflect"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-library"
-        versions: [">= 2.13.0"]
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-example"
     schedule:
       interval: "daily"
-    ignore:
-      # Pin Scala version to 2.12.x
-      - dependency-name: "org.scala-lang:scala-compiler"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-reflect"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-library"
-        versions: [">= 2.13.0"]
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-spark"
     schedule:
       interval: "daily"
-    ignore:
-      # Pin Scala version to 2.12.x
-      - dependency-name: "org.scala-lang:scala-compiler"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-reflect"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-library"
-        versions: [">= 2.13.0"]
-      # Pin Spark version to 3.0.x
-      - dependency-name: "org.apache.spark:spark-core_2.12"
-        versions: [">= 3.1.0"]
-      - dependency-name: "org.apache.spark:spark-sql_2.12"
-        versions: [">= 3.1.0"]
-      - dependency-name: "org.apache.spark:spark-mllib_2.12"
-        versions: [">= 3.1.0"]
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-spark-gpu"
     schedule:
       interval: "daily"
-    ignore:
-      # Pin Scala version to 2.12.x
-      - dependency-name: "org.scala-lang:scala-compiler"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-reflect"
-        versions: [">= 2.13.0"]
-      - dependency-name: "org.scala-lang:scala-library"
-        versions: [">= 2.13.0"]
-      # Pin Spark version to 3.0.x
-      - dependency-name: "org.apache.spark:spark-core_2.12"
-        versions: [">= 3.1.0"]
-      - dependency-name: "org.apache.spark:spark-sql_2.12"
-        versions: [">= 3.1.0"]
-      - dependency-name: "org.apache.spark:spark-mllib_2.12"
-        versions: [">= 3.1.0"]

.github/workflows/update_rapids.yml vendored Normal file
View File

@@ -0,0 +1,40 @@
name: update-rapids

on:
  schedule:
    - cron: "20 20 * * *"  # Run once daily

permissions:
  contents: read  # to fetch code (actions/checkout)

defaults:
  run:
    shell: bash -l {0}

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # To use GitHub CLI

jobs:
  update-rapids:
    name: Check latest RAPIDS
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          submodules: 'true'
      - name: Check latest RAPIDS and update conftest.sh
        run: |
          bash tests/buildkite/update-rapids.sh
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        if: github.ref == 'refs/heads/master'
        with:
          add-paths: |
            tests/buildkite
          branch: create-pull-request/update-rapids
          base: master

View File

@@ -319,7 +319,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
   # maximize is usually NULL when not set in xgb.train and built-in metrics
   if (is.null(maximize))
-    maximize <<- grepl('(_auc|_map|_ndcg)', metric_name)
+    maximize <<- grepl('(_auc|_map|_ndcg|_pre)', metric_name)
   if (verbose && NVL(env$rank, 0) == 0)
     cat("Will train until ", metric_name, " hasn't improved in ",

View File

@@ -1,6 +1,6 @@
 <img src="https://xgboost.ai/images/logo/xgboost-logo.svg" width=135/> eXtreme Gradient Boosting
 ===========
-[![Build Status](https://xgboost-ci.net/job/xgboost/job/master/badge/icon)](https://xgboost-ci.net/blue/organizations/jenkins/xgboost/activity)
+[![Build Status](https://badge.buildkite.com/aca47f40a32735c00a8550540c5eeff6a4c1d246a580cae9b0.svg?branch=master)](https://buildkite.com/xgboost/xgboost-ci)
 [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions)
 [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org)
 [![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)

View File

@@ -424,6 +424,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   After XGBoost 1.6, both of the requirements and restrictions for using ``aucpr`` in classification problem are similar to ``auc``. For ranking task, only binary relevance label :math:`y \in [0, 1]` is supported. Different from ``map (mean average precision)``, ``aucpr`` calculates the *interpolated* area under precision recall curve using continuous interpolation.
+- ``pre``: Precision at :math:`k`. Supports only learning to rank task.
 - ``ndcg``: `Normalized Discounted Cumulative Gain <http://en.wikipedia.org/wiki/NDCG>`_
 - ``map``: `Mean Average Precision <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_
@@ -435,7 +436,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   where :math:`I_{(k)}` is an indicator function that equals to :math:`1` when the document at :math:`k` is relevant and :math:`0` otherwise. The :math:`P@k` is the precision at :math:`k`, and :math:`N` is the total number of relevant documents. Lastly, the `mean average precision` is defined as the weighted average across all queries.
-- ``ndcg@n``, ``map@n``: :math:`n` can be assigned as an integer to cut off the top positions in the lists for evaluation.
+- ``ndcg@n``, ``map@n``, ``pre@n``: :math:`n` can be assigned as an integer to cut off the top positions in the lists for evaluation.
 - ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, the NDCG and MAP evaluate the score of a list without any positive samples as :math:`1`. By appending "-" to the evaluation metric name, we can ask XGBoost to evaluate these scores as :math:`0` to be consistent under some conditions.
 - ``poisson-nloglik``: negative log-likelihood for Poisson regression
 - ``gamma-nloglik``: negative log-likelihood for gamma regression
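For context (not part of the commit), the new ``pre`` metric can be exercised from the sklearn interface; a minimal sketch on synthetic data with a single query group, mirroring the test added later in this commit:

```python
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=256, n_features=4, random_state=0)
qid = np.zeros(y.shape)  # a single query group covering all rows

ranker = xgb.XGBRanker(n_estimators=4, eval_metric="pre@16")
ranker.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid], verbose=True)
# each round logs validation_0-pre@16, i.e. the fraction of relevant
# documents among the 16 highest-ranked ones
```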

View File

@@ -134,16 +134,18 @@ class Predictor {
    * usually more efficient than online prediction This function is NOT
    * threadsafe, make sure you only call from one thread.
    *
    * \param inst The instance to predict.
    * \param [in,out] out_preds The output preds.
    * \param model The model to predict from
    * \param tree_end (Optional) The tree end index.
+   * \param is_column_split (Optional) If the data is split column-wise.
    */
   virtual void PredictInstance(const SparsePage::Inst& inst,
                                std::vector<bst_float>* out_preds,
                                const gbm::GBTreeModel& model,
-                               unsigned tree_end = 0) const = 0;
+                               unsigned tree_end = 0,
+                               bool is_column_split = false) const = 0;
   /**
    * \brief predict the leaf index of each tree, the output will be nsample *

View File

@@ -1,5 +1,5 @@
 # XGBoost4J: Distributed XGBoost for Scala/Java
-[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost)
+[![Build Status](https://badge.buildkite.com/aca47f40a32735c00a8550540c5eeff6a4c1d246a580cae9b0.svg?branch=master)](https://buildkite.com/xgboost/xgboost-ci)
 [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org/en/latest/jvm/index.html)
 [![GitHub license](http://dmlc.github.io/img/apache2.svg)](../LICENSE)

View File

@@ -130,7 +130,7 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-release-plugin</artifactId>
-        <version>3.0.0</version>
+        <version>3.0.1</version>
         <configuration>
           <autoVersionSubmodules>true</autoVersionSubmodules>
           <useReleaseProfile>false</useReleaseProfile>
@@ -301,14 +301,6 @@
       <url>https://s3.amazonaws.com/xgboost-maven-repo/release</url>
     </repository>
   </repositories>
-  <modules>
-    <module>xgboost4j</module>
-    <module>xgboost4j-example</module>
-    <module>xgboost4j-spark</module>
-    <module>xgboost4j-flink</module>
-    <module>xgboost4j-gpu</module>
-    <module>xgboost4j-spark-gpu</module>
-  </modules>
   <build>
     <plugins>
       <plugin>

View File

@@ -372,6 +372,8 @@ class EarlyStopping(TrainingCallback):
         maximize_metrics = (
             "auc",
             "aucpr",
+            "pre",
+            "pre@",
             "map",
             "ndcg",
             "auc@",

View File

@@ -1,9 +1,61 @@
 """Tests for evaluation metrics."""
-from typing import Dict
+from typing import Dict, List
 
 import numpy as np
+import pytest
 
 import xgboost as xgb
+from xgboost.compat import concat
+from xgboost.core import _parse_eval_str
+
+
+def check_precision_score(tree_method: str) -> None:
+    """Test for precision with ranking and classification."""
+    datasets = pytest.importorskip("sklearn.datasets")
+
+    X, y = datasets.make_classification(
+        n_samples=1024, n_features=4, n_classes=2, random_state=2023
+    )
+    qid = np.zeros(shape=y.shape)  # same group
+
+    ltr = xgb.XGBRanker(n_estimators=2, tree_method=tree_method)
+    ltr.fit(X, y, qid=qid)
+
+    # re-generate so that XGBoost doesn't evaluate the result to 1.0
+    X, y = datasets.make_classification(
+        n_samples=512, n_features=4, n_classes=2, random_state=1994
+    )
+
+    ltr.set_params(eval_metric="pre@32")
+    result = _parse_eval_str(
+        ltr.get_booster().eval_set(evals=[(xgb.DMatrix(X, y), "Xy")])
+    )
+    score_0 = result[1][1]
+
+    X_list = []
+    y_list = []
+    n_query_groups = 3
+    q_list: List[np.ndarray] = []
+    for i in range(n_query_groups):
+        # same for all groups
+        X, y = datasets.make_classification(
+            n_samples=512, n_features=4, n_classes=2, random_state=1994
+        )
+        X_list.append(X)
+        y_list.append(y)
+        q = np.full(shape=y.shape, fill_value=i, dtype=np.uint64)
+        q_list.append(q)
+
+    qid = concat(q_list)
+    X = concat(X_list)
+    y = concat(y_list)
+    result = _parse_eval_str(
+        ltr.get_booster().eval_set(evals=[(xgb.DMatrix(X, y, qid=qid), "Xy")])
+    )
+    assert result[1][0].endswith("pre@32")
+    score_1 = result[1][1]
+    assert score_1 == score_0
+
+
 def check_quantile_error(tree_method: str) -> None:

rabit/.gitignore vendored
View File

@@ -1,52 +0,0 @@
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
*.lnk
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.miss
*.exe
*.out
*.app
*~
*.pyc
*.mpi
*.exe
*tmp*
*.rabit
*.mock
recommonmark
recom
_*
#mpi lib
mpich/
mpich-3.2/
# Jetbrain
.idea
cmake-build-debug/
.vscode/
# cmake
build/
compile_commands.json

View File

@@ -1,5 +0,0 @@
html
latex
*.sh
_*
doxygen

View File

@@ -1,281 +0,0 @@
# Doxyfile 1.7.6.1
#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------
DOXYFILE_ENCODING = UTF-8
PROJECT_NAME = "rabit"
PROJECT_NUMBER =
PROJECT_BRIEF =
PROJECT_LOGO =
OUTPUT_DIRECTORY = ../doc/doxygen
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
BRIEF_MEMBER_DESC = YES
REPEAT_BRIEF = YES
ABBREVIATE_BRIEF =
ALWAYS_DETAILED_SEC = NO
INLINE_INHERITED_MEMB = NO
FULL_PATH_NAMES = YES
STRIP_FROM_PATH =
STRIP_FROM_INC_PATH =
SHORT_NAMES = NO
JAVADOC_AUTOBRIEF = NO
QT_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = NO
INHERIT_DOCS = YES
SEPARATE_MEMBER_PAGES = NO
TAB_SIZE = 8
ALIASES =
TCL_SUBST =
OPTIMIZE_OUTPUT_FOR_C = YES
OPTIMIZE_OUTPUT_JAVA = NO
OPTIMIZE_FOR_FORTRAN = NO
OPTIMIZE_OUTPUT_VHDL = NO
EXTENSION_MAPPING =
BUILTIN_STL_SUPPORT = NO
CPP_CLI_SUPPORT = NO
SIP_SUPPORT = NO
IDL_PROPERTY_SUPPORT = YES
DISTRIBUTE_GROUP_DOC = NO
SUBGROUPING = YES
INLINE_GROUPED_CLASSES = NO
INLINE_SIMPLE_STRUCTS = NO
TYPEDEF_HIDES_STRUCT = NO
LOOKUP_CACHE_SIZE = 0
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
EXTRACT_ALL = NO
EXTRACT_PRIVATE = NO
EXTRACT_STATIC = NO
EXTRACT_LOCAL_CLASSES = YES
EXTRACT_LOCAL_METHODS = NO
EXTRACT_ANON_NSPACES = NO
HIDE_UNDOC_MEMBERS = NO
HIDE_UNDOC_CLASSES = YES
HIDE_FRIEND_COMPOUNDS = NO
HIDE_IN_BODY_DOCS = NO
INTERNAL_DOCS = NO
CASE_SENSE_NAMES = YES
HIDE_SCOPE_NAMES = NO
SHOW_INCLUDE_FILES = YES
FORCE_LOCAL_INCLUDES = NO
INLINE_INFO = YES
SORT_MEMBER_DOCS = YES
SORT_BRIEF_DOCS = NO
SORT_MEMBERS_CTORS_1ST = NO
SORT_GROUP_NAMES = NO
SORT_BY_SCOPE_NAME = NO
STRICT_PROTO_MATCHING = NO
GENERATE_TODOLIST = YES
GENERATE_TESTLIST = YES
GENERATE_BUGLIST = YES
GENERATE_DEPRECATEDLIST= YES
ENABLED_SECTIONS =
MAX_INITIALIZER_LINES = 30
SHOW_USED_FILES = YES
SHOW_FILES = YES
SHOW_NAMESPACES = YES
FILE_VERSION_FILTER =
LAYOUT_FILE =
CITE_BIB_FILES =
#---------------------------------------------------------------------------
# configuration options related to warning and progress messages
#---------------------------------------------------------------------------
QUIET = NO
WARNINGS = YES
WARN_IF_UNDOCUMENTED = YES
WARN_IF_DOC_ERROR = YES
WARN_NO_PARAMDOC = YES
WARN_FORMAT = "$file:$line: $text"
WARN_LOGFILE =
#---------------------------------------------------------------------------
# configuration options related to the input files
#---------------------------------------------------------------------------
INPUT = rabit
INPUT_ENCODING = UTF-8
FILE_PATTERNS =
RECURSIVE = NO
EXCLUDE =
EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS = *-inl.hpp
EXCLUDE_SYMBOLS =
EXAMPLE_PATH =
EXAMPLE_PATTERNS =
EXAMPLE_RECURSIVE = NO
IMAGE_PATH =
INPUT_FILTER =
FILTER_PATTERNS =
FILTER_SOURCE_FILES = NO
FILTER_SOURCE_PATTERNS =
#---------------------------------------------------------------------------
# configuration options related to source browsing
#---------------------------------------------------------------------------
SOURCE_BROWSER = NO
INLINE_SOURCES = NO
STRIP_CODE_COMMENTS = YES
REFERENCED_BY_RELATION = NO
REFERENCES_RELATION = NO
REFERENCES_LINK_SOURCE = YES
USE_HTAGS = NO
VERBATIM_HEADERS = YES
#---------------------------------------------------------------------------
# configuration options related to the alphabetical class index
#---------------------------------------------------------------------------
ALPHABETICAL_INDEX = YES
COLS_IN_ALPHA_INDEX = 5
IGNORE_PREFIX =
#---------------------------------------------------------------------------
# configuration options related to the HTML output
#---------------------------------------------------------------------------
GENERATE_HTML = YES
HTML_OUTPUT = html
HTML_FILE_EXTENSION = .html
HTML_HEADER =
HTML_FOOTER =
HTML_STYLESHEET =
HTML_EXTRA_FILES =
HTML_COLORSTYLE_HUE = 220
HTML_COLORSTYLE_SAT = 100
HTML_COLORSTYLE_GAMMA = 80
HTML_TIMESTAMP = YES
HTML_DYNAMIC_SECTIONS = NO
GENERATE_DOCSET = NO
DOCSET_FEEDNAME = "Doxygen generated docs"
DOCSET_BUNDLE_ID = org.doxygen.Project
DOCSET_PUBLISHER_ID = org.doxygen.Publisher
DOCSET_PUBLISHER_NAME = Publisher
GENERATE_HTMLHELP = NO
CHM_FILE =
HHC_LOCATION =
GENERATE_CHI = NO
CHM_INDEX_ENCODING =
BINARY_TOC = NO
TOC_EXPAND = NO
GENERATE_QHP = NO
QCH_FILE =
QHP_NAMESPACE = org.doxygen.Project
QHP_VIRTUAL_FOLDER = doc
QHP_CUST_FILTER_NAME =
QHP_CUST_FILTER_ATTRS =
QHP_SECT_FILTER_ATTRS =
QHG_LOCATION =
GENERATE_ECLIPSEHELP = NO
ECLIPSE_DOC_ID = org.doxygen.Project
DISABLE_INDEX = NO
GENERATE_TREEVIEW = NO
ENUM_VALUES_PER_LINE = 4
TREEVIEW_WIDTH = 250
EXT_LINKS_IN_WINDOW = NO
FORMULA_FONTSIZE = 10
FORMULA_TRANSPARENT = YES
USE_MATHJAX = NO
MATHJAX_RELPATH = http://www.mathjax.org/mathjax
MATHJAX_EXTENSIONS =
SEARCHENGINE = YES
SERVER_BASED_SEARCH = NO
#---------------------------------------------------------------------------
# configuration options related to the LaTeX output
#---------------------------------------------------------------------------
GENERATE_LATEX = YES
LATEX_OUTPUT = latex
LATEX_CMD_NAME = latex
MAKEINDEX_CMD_NAME = makeindex
COMPACT_LATEX = NO
PAPER_TYPE = a4
EXTRA_PACKAGES =
LATEX_HEADER =
LATEX_FOOTER =
PDF_HYPERLINKS = YES
USE_PDFLATEX = YES
LATEX_BATCHMODE = NO
LATEX_HIDE_INDICES = NO
LATEX_SOURCE_CODE = NO
LATEX_BIB_STYLE = plain
#---------------------------------------------------------------------------
# configuration options related to the RTF output
#---------------------------------------------------------------------------
GENERATE_RTF = NO
RTF_OUTPUT = rtf
COMPACT_RTF = NO
RTF_HYPERLINKS = NO
RTF_STYLESHEET_FILE =
RTF_EXTENSIONS_FILE =
#---------------------------------------------------------------------------
# configuration options related to the man page output
#---------------------------------------------------------------------------
GENERATE_MAN = NO
MAN_OUTPUT = man
MAN_EXTENSION = .3
MAN_LINKS = NO
#---------------------------------------------------------------------------
# configuration options related to the XML output
#---------------------------------------------------------------------------
GENERATE_XML = YES
XML_OUTPUT = xml
XML_PROGRAMLISTING = YES
#---------------------------------------------------------------------------
# configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
GENERATE_AUTOGEN_DEF = NO
#---------------------------------------------------------------------------
# configuration options related to the Perl module output
#---------------------------------------------------------------------------
GENERATE_PERLMOD = NO
PERLMOD_LATEX = NO
PERLMOD_PRETTY = YES
PERLMOD_MAKEVAR_PREFIX =
#---------------------------------------------------------------------------
# Configuration options related to the preprocessor
#---------------------------------------------------------------------------
ENABLE_PREPROCESSING = NO
MACRO_EXPANSION = NO
EXPAND_ONLY_PREDEF = NO
SEARCH_INCLUDES = YES
INCLUDE_PATH =
INCLUDE_FILE_PATTERNS =
PREDEFINED =
EXPAND_AS_DEFINED =
SKIP_FUNCTION_MACROS = YES
#---------------------------------------------------------------------------
# Configuration::additions related to external references
#---------------------------------------------------------------------------
TAGFILES =
GENERATE_TAGFILE =
ALLEXTERNALS = NO
EXTERNAL_GROUPS = YES
PERL_PATH = /usr/bin/perl
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
CLASS_DIAGRAMS = YES
MSCGEN_PATH =
HIDE_UNDOC_RELATIONS = YES
HAVE_DOT = NO
DOT_NUM_THREADS = 0
DOT_FONTNAME = Helvetica
DOT_FONTSIZE = 10
DOT_FONTPATH =
CLASS_GRAPH = YES
COLLABORATION_GRAPH = YES
GROUP_GRAPHS = YES
UML_LOOK = NO
TEMPLATE_RELATIONS = NO
INCLUDE_GRAPH = YES
INCLUDED_BY_GRAPH = YES
CALL_GRAPH = NO
CALLER_GRAPH = NO
GRAPHICAL_HIERARCHY = YES
DIRECTORY_GRAPH = YES
DOT_IMAGE_FORMAT = png
INTERACTIVE_SVG = NO
DOT_PATH =
DOTFILE_DIRS =
MSCFILE_DIRS =
DOT_GRAPH_MAX_NODES = 50
MAX_DOT_GRAPH_DEPTH = 0
DOT_TRANSPARENT = NO
DOT_MULTI_TARGETS = YES
GENERATE_LEGEND = YES
DOT_CLEANUP = YES

View File

@@ -1,192 +0,0 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
applehelp:
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@echo
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
@echo "N.B. You won't be able to view it unless you put it in" \
"~/Library/Documentation/Help or install it in your application" \
"bundle."
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/rabit"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
coverage:
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
@echo "Testing of coverage in the sources finished, look at the " \
"results in $(BUILDDIR)/coverage/python.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

View File

@@ -1,184 +0,0 @@
# -*- coding: utf-8 -*-
#
# documentation build configuration file, created by
# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os, subprocess
import shlex
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
libpath = os.path.join(curr_path, '../wrapper/')
sys.path.insert(0, os.path.join(curr_path, '../wrapper/'))
sys.path.insert(0, curr_path)
from sphinx_util import MarkdownParser, AutoStructify
# -- General configuration ------------------------------------------------
# General information about the project.
project = u'rabit'
copyright = u'2015, rabit developers'
author = u'rabit developers'
github_doc_root = 'https://github.com/dmlc/rabit/tree/master/doc/'
# add markdown parser
MarkdownParser.github_doc_root = github_doc_root
source_parsers = {
    '.md': MarkdownParser,
}
# Version information.
import rabit
version = rabit.__version__
release = rabit.__version__
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.napoleon',
    'sphinx.ext.mathjax',
    'breathe',
]
# Use breathe to include doxygen documents
breathe_projects = {'rabit' : 'doxygen/xml/'}
breathe_default_project = 'rabit'
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
source_suffix = ['.rst', '.md']
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# html_theme = 'alabaster'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'rabit.tex', project,
     author, 'manual'),
]
# hook for doxygen
def run_doxygen(folder):
    """Run the doxygen make command in the designated folder."""
    try:
        retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True)
        if retcode < 0:
            sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
    except OSError as e:
        sys.stderr.write("doxygen execution failed: %s" % e)

def run_build_lib(folder):
    """Run the make command in the designated folder to build the shared library."""
    try:
        retcode = subprocess.call("cd %s; make" % folder, shell=True)
        retcode = subprocess.call("rm -rf _build/html/doxygen", shell=True)
        retcode = subprocess.call("mkdir _build", shell=True)
        retcode = subprocess.call("mkdir _build/html", shell=True)
        retcode = subprocess.call("cp -rf doxygen/html _build/html/doxygen", shell=True)
        if retcode < 0:
            sys.stderr.write("build terminated by signal %s" % (-retcode))
    except OSError as e:
        sys.stderr.write("build execution failed: %s" % e)

def generate_doxygen_xml(app):
    """Run the doxygen make commands if we're on the ReadTheDocs server"""
    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
    if read_the_docs_build:
        run_doxygen('..')
        sys.stderr.write('Check if shared lib exists\n')
        run_build_lib('..')
        sys.stderr.write('The wrapper path: %s\n' % str(os.listdir('../wrapper')))
        rabit._loadlib()

def setup(app):
    # Add hook for building doxygen xml when needed
    app.connect("builder-inited", generate_doxygen_xml)
    app.add_config_value('recommonmark_config', {
        'url_resolver': lambda url: github_doc_root + url,
    }, True)
    app.add_transform(AutoStructify)

View File

@@ -1,9 +0,0 @@
C++ Library API of Rabit
========================
This page contains the documentation of the C++ library API of rabit.
```eval_rst
.. toctree::
.. doxygennamespace:: rabit
```

View File

@@ -1,383 +0,0 @@
Tutorial
========
This is the tutorial for rabit, a ***Reliable Allreduce and Broadcast Interface***.
All the example code is in the [guide](https://github.com/dmlc/rabit/blob/master/guide/) folder of the project.
To run the examples locally, you will need to build them with ```make```.
**List of Topics**
* [What is Allreduce](#what-is-allreduce)
* [Common Use Case](#common-use-case)
* [Use Rabit API](#use-rabit-api)
- [Structure of a Rabit Program](#structure-of-a-rabit-program)
- [Allreduce and Lazy Preparation](#allreduce-and-lazy-preparation)
- [Checkpoint and LazyCheckpoint](#checkpoint-and-lazycheckpoint)
* [Compile Programs with Rabit](#compile-programs-with-rabit)
* [Running Rabit Jobs](#running-rabit-jobs)
* [Fault Tolerance](#fault-tolerance)
What is Allreduce
-----------------
The main methods provided by rabit are Allreduce and Broadcast. Allreduce performs reduction across different computation nodes,
and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](../guide/basic.cc) (there is a python example right after this if you are more familiar with python).
```c++
#include <rabit.h>
using namespace rabit;
const int N = 3;
int main(int argc, char *argv[]) {
  int a[N];
  rabit::Init(argc, argv);
  for (int i = 0; i < N; ++i) {
    a[i] = rabit::GetRank() + i;
  }
  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
         rabit::GetRank(), a[0], a[1], a[2]);
  // allreduce takes the max of each element across all processes
  Allreduce<op::Max>(&a[0], N);
  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
         rabit::GetRank(), a[0], a[1], a[2]);
  // a second allreduce that sums everything up
  Allreduce<op::Sum>(&a[0], N);
  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
         rabit::GetRank(), a[0], a[1], a[2]);
  rabit::Finalize();
  return 0;
}
```
You can run the example using the rabit_demo.py script. The following command
starts the rabit program with two worker processes.
```bash
../tracker/rabit_demo.py -n 2 basic.rabit
```
This will start two processes, one with rank 0 and the other with rank 1, both running the same code.
The ```rabit::GetRank()``` function returns the rank of the current process.
Before the call to Allreduce, process 0 contains the array ```a = {0, 1, 2}```, while process 1 has the array
```a = {1, 2, 3}```. After the call to Allreduce, the array contents in all processes are replaced by the
reduction result (in this case, the maximum value in each position across all the processes). So, after the
Allreduce call, the result will become ```a = {1, 2, 3}```.
Rabit provides different reduction operators; for example, if you change ```op::Max``` to ```op::Sum```,
the reduction operation will be a summation, and the result will become ```a = {1, 3, 5}```.
You can also run the example with a different number of processes by setting -n to different values.
If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](../guide/basic.py):
```python
import numpy as np
import rabit

rabit.init()
n = 3
rank = rabit.get_rank()
a = np.zeros(n)
for i in xrange(n):
    a[i] = rank + i

print '@node[%d] before-allreduce: a=%s' % (rank, str(a))
a = rabit.allreduce(a, rabit.MAX)
print '@node[%d] after-allreduce-max: a=%s' % (rank, str(a))
a = rabit.allreduce(a, rabit.SUM)
print '@node[%d] after-allreduce-sum: a=%s' % (rank, str(a))
rabit.finalize()
```
You can run the program using the following command
```bash
../tracker/rabit_demo.py -n 2 basic.py
```
Broadcast is another method provided by rabit besides Allreduce. This function allows one node to broadcast its
local data to all other nodes. The following code in [broadcast.cc](../guide/broadcast.cc) broadcasts a string from
node 0 to all other nodes.
```c++
#include <rabit.h>
using namespace rabit;
const int N = 3;
int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  std::string s;
  if (rabit::GetRank() == 0) s = "hello world";
  printf("@node[%d] before-broadcast: s=\"%s\"\n",
         rabit::GetRank(), s.c_str());
  // broadcast s from node 0 to all other nodes
  rabit::Broadcast(&s, 0);
  printf("@node[%d] after-broadcast: s=\"%s\"\n",
         rabit::GetRank(), s.c_str());
  rabit::Finalize();
  return 0;
}
```
The following command starts the program with three worker processes.
```bash
../tracker/rabit_demo.py -n 3 broadcast.rabit
```
Besides strings, rabit can also broadcast constant-size arrays and vectors.
The counterpart in python can be found in [broadcast.py](../guide/broadcast.py). Here is a snippet so that you can get a better sense of how simple it is to use the python library:
```python
import rabit

rabit.init()
n = 3
rank = rabit.get_rank()
s = None
if rank == 0:
    s = {'hello world': 100, 2: 3}
print '@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s))
s = rabit.broadcast(s, 0)
print '@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s))
rabit.finalize()
```
Common Use Case
---------------
Many distributed machine learning algorithms involve splitting the data across nodes,
computing statistics locally, and finally aggregating them. Such a workflow is usually repeated
through many iterations before the algorithm converges. Allreduce naturally fits the structure of such programs;
common use cases include:
* Aggregation of gradient values, which can be used in optimization methods such as L-BFGS.
* Aggregation of other statistics, which can be used in KMeans and Gaussian Mixture Models.
* Finding the best split candidate and aggregating split statistics, as used in tree-based models.
Rabit is a reliable and portable library for distributed machine learning, allowing programs to run reliably on different platforms.
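To make the first use case concrete, here is a minimal sketch using the python wrapper from the examples above (the random "gradient" is a stand-in for whatever the algorithm actually computes on its shard of the data):

```python
import numpy as np
import rabit

rabit.init()
# each worker computes a gradient on its own data shard (random here)
local_grad = np.random.RandomState(rabit.get_rank()).normal(size=8)
# Allreduce(SUM) leaves the globally aggregated gradient on every worker
global_grad = rabit.allreduce(local_grad, rabit.SUM)
rabit.finalize()
```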
Use Rabit API
-------------
This section introduces how to use the rabit API.
You can always refer to the [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for the definition of each function.
It tries to give examples of the different aspects of the rabit API.
#### Structure of a Rabit Program
The following code illustrates the common structure of a rabit program. This is an abstract example;
you can also refer to [wormhole](https://github.com/dmlc/wormhole/blob/master/learn/kmeans/kmeans.cc) for an example implementation of the kmeans algorithm.
```c++
#include <rabit.h>
int main(int argc, char *argv[]) {
  ...
  rabit::Init(argc, argv);
  // sync on the expected model size before loading the checkpoint, if we pass rabit_bootstrap_cache=true
  rabit::Allreduce<rabit::op::Max>(&model.size(), 1);
  // load the latest checkpointed model
  int version = rabit::LoadCheckPoint(&model);
  // initialize the model if it is the first version
  if (version == 0) model.InitModel();
  // the version number marks the iteration to resume
  for (int iter = version; iter < max_iter; ++iter) {
    // at this point, the model object should allow us to recover the program state
    ...
    // each iteration can contain multiple calls to allreduce/broadcast
    rabit::Allreduce<rabit::op::Max>(&data[0], n);
    ...
    // checkpoint the model after one iteration finishes
    rabit::CheckPoint(&model);
  }
  rabit::Finalize();
  return 0;
}
```
Besides the common Allreduce and Broadcast functions, there are two additional functions: ```LoadCheckPoint```
and ```CheckPoint```. These two functions are used for fault-tolerance purposes.
As mentioned before, traditional machine learning programs involve several iterations. In each iteration, we start with a model, make some calls
to Allreduce or Broadcast and update the model. The calling sequence in each iteration does not need to be the same.
* When the nodes start from the beginning (i.e. iteration 0), ```LoadCheckPoint``` returns 0, so we can initialize the model.
* ```CheckPoint``` saves the model after each iteration.
- Efficiency Note: the model is only kept in local memory and no save to disk is performed when calling Checkpoint
* When a node goes down and restarts, ```LoadCheckPoint``` will recover the latest saved model.
* When a node goes down, the rest of the nodes will block in the call of Allreduce/Broadcast and wait for
the recovery of the failed node until it catches up.
Please see the [Fault Tolerance](#fault-tolerance) section to understand the recovery procedure executed by rabit.
#### Allreduce and Lazy Preparation
Allreduce is one of the most important functions provided by rabit. You can call allreduce by specifying the
reduction operator, a pointer to the data, and the size of the buffer, as follows
```c++
Allreduce<operator>(pointer_of_data, size_of_data);
```
This is the basic use case of the Allreduce function. Commonly, the user writes code that prepares the needed data
in the buffer, passes the data to Allreduce, and gets the reduced result. However, when a node restarts
after a failure, the result can be recovered directly from other nodes (see also [Fault Tolerance](#fault-tolerance)), and
the data preparation procedure is no longer necessary. To support this scenario, rabit's Allreduce accepts an optional
preparation function: the user passes in a function corresponding to the data preparation procedure, and it is only
called when necessary. We use [lazy_allreduce.cc](../guide/lazy_allreduce.cc)
as an example to demonstrate this feature. It is modified from [basic.cc](../guide/basic.cc); you can compare the two.
```c++
#include <rabit.h>
using namespace rabit;
const int N = 3;
int main(int argc, char *argv[]) {
  int a[N] = {0};
  rabit::Init(argc, argv);
  // lazy preparation function
  auto prepare = [&]() {
    printf("@node[%d] run prepare function\n", rabit::GetRank());
    for (int i = 0; i < N; ++i) {
      a[i] = rabit::GetRank() + i;
    }
  };
  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
         rabit::GetRank(), a[0], a[1], a[2]);
  // allreduce takes the max of each element across all processes
  Allreduce<op::Max>(&a[0], N, prepare);
  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
         rabit::GetRank(), a[0], a[1], a[2]);
  // run a second allreduce that sums everything up
  Allreduce<op::Sum>(&a[0], N);
  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
         rabit::GetRank(), a[0], a[1], a[2]);
  rabit::Finalize();
  return 0;
}
```
Here we use features of C++11, because the lambda function makes things much shorter.
There is also a C++-compatible callback interface provided in the [API](http://homes.cs.washington.edu/~tqchen/rabit/doc).
You can compile the program by typing ```make lazy_allreduce.mock```. We link against the mock library so that we can see
the effect when a process goes down. You can run the program using the following command
```bash
../tracker/rabit_demo.py -n 2 lazy_allreduce.mock mock=0,0,1,0
```
The additional arguments ```mock=0,0,1,0``` will cause node 0 to kill itself before the second call of Allreduce (see also [mock test](#link-against-mock-test-rabit-library)).
You will find that the prepare function is only executed once, and that node 0 no longer runs the preparation function when it restarts after the failure.
You can also find the python version of the example in [lazy_allreduce.py](../guide/lazy_allreduce.py), and run it using the following command
```bash
../tracker/rabit_demo.py -n 2 lazy_allreduce.py mock=0,0,1,0
```
Since the lazy preparation function may not be called during execution, the user should be careful when using this feature. For example, a possible mistake
is putting memory allocation code in the lazy preparation function, so that the memory is never allocated when the function is skipped.
The example in [lazy_allreduce.cc](../guide/lazy_allreduce.cc) shows a simple way to migrate normal preparation code ([basic.cc](../guide/basic.cc)) to the lazy version: wrap the preparation
code in a lambda function, and pass it to allreduce.
#### Checkpoint and LazyCheckpoint
Common machine learning algorithms usually involve iterative computation. As mentioned in the section ([Structure of a Rabit Program](#structure-of-a-rabit-program)),
the user can and should use Checkpoint to ```save``` the progress so far, so that when a node fails, the latest checkpointed model can be loaded.
There are two model arguments you can pass to Checkpoint and LoadCheckpoint: ```global_model``` and ```local_model```:
* ```global_model``` refers to the model that is commonly shared across all the nodes
- For example, the centroids of the clusters in kmeans are shared across all nodes
* ```local_model``` refers to the model that is specifically tied to the current node
- For example, in topic modeling, the topic assignments of the subset of documents on the current node are the local model
Because of the different nature of the two types of models, different strategies are used for them.
```global_model``` is simply saved in the local memory of each node, while ```local_model``` is replicated to some other
nodes (selected using a ring replication strategy). The checkpoint is only saved in memory without touching the disk, which makes rabit programs more efficient.
For better efficiency, users are encouraged to use only ```global_model``` when it is sufficient.
To enable checkpointing of a model class, the user can implement a [serialization interface](../include/rabit_serialization.h). The serialization interface already
provides serialization functions for STL vector and string. For the python API, the user can checkpoint any python object that can be pickled.
There is a special Checkpoint function called [LazyCheckpoint](http://homes.cs.washington.edu/~tqchen/rabit/doc/namespacerabit.html#a99f74c357afa5fba2c80cc0363e4e459),
which can be used in ```global_model```-only cases under certain conditions.
When LazyCheckpoint is called, no action is taken; the rabit engine only remembers the pointer to the model.
The serialization only happens when another node fails and recovery starts, so the user pays essentially no extra cost for calling LazyCheckpoint.
To use this function, the user needs to ensure the model remains unchanged until the last call of Allreduce/Broadcast in the current version finishes,
so that when the recovery procedure happens inside these function calls, the serialized model will be the same.
For example, consider the following calling sequence
```
LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
```
The user must only change the model in code3. This condition can usually be satisfied, and the user can then rely on LazyCheckpoint to further
improve the efficiency of the program.
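A minimal sketch of the same iteration skeleton in python, assuming the wrapper exposes ```load_checkpoint```/```checkpoint``` helpers as suggested above (verify the exact signatures in rabit.py):

```python
import numpy as np
import rabit

rabit.init()
version, model = rabit.load_checkpoint()  # assumed helper; version == 0 on a fresh start
if version == 0:
    model = np.zeros(3)  # initialize the global model
for it in range(version, 10):
    model = rabit.allreduce(model, rabit.SUM)  # iteration body
    rabit.checkpoint(model)  # snapshot kept in memory, not written to disk
rabit.finalize()
```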
Compile Programs with Rabit
---------------------------
Rabit is a portable library; to use it, you only need to include the rabit header file.
* You will need to add the path to [../include](../include) to the header search path of the compiler
- Solution 1: add ```-I/path/to/rabit/include``` to the compiler flag in gcc or clang
- Solution 2: add the path to the environment variable CPLUS_INCLUDE_PATH
* You will need to add the path to [../lib](../lib) to the library search path of the compiler
- Solution 1: add ```-L/path/to/rabit/lib``` to the linker flag
- Solution 2: add the path to the environment variables LIBRARY_PATH and LD_LIBRARY_PATH
* Link against lib/rabit.a
- Add ```-lrabit``` to the linker flag
The procedure above allows you to compile a program with rabit. The following two sections contain additional
options for linking against backends other than the default one.
#### Link against MPI Allreduce
You can link against ```rabit_mpi.a``` instead, which backs Allreduce with MPI; however, the resulting program
is no longer fault tolerant.
* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mpi```
* The final linking needs to be done by mpi wrapper compiler ```mpicxx```
#### Link against Mock Test Rabit Library
If you want to use a mock to test the program in order to see the behavior of the code when some nodes go down, you can link against ```rabit_mock.a``` .
* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mock```
The resulting rabit mock program can take in additional arguments in the following format
```
mock=rank,version,seq,ndeath
```
The four integers specify an event that will cause the program to ```commit suicide``` (exit with -2):
* rank specifies the rank of the node to kill
* version specifies the version (iteration) of the model where you want the process to die
* seq specifies the sequence number of the Allreduce/Broadcast call since last checkpoint, where the process will be killed
* ndeath specifies how many times this node died already
For example, consider the following script in the test case
```bash
../tracker/rabit_demo.py -n 10 test_model_recover 10000\
mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1
```
* The first mock will cause node 0 to exit when calling the second Allreduce/Broadcast (seq = 1) in iteration 0
* The second mock will cause node 1 to exit when calling the second Allreduce/Broadcast (seq = 1) in iteration 1
* The third mock will cause node 1 to exit again when calling second Allreduce/Broadcast (seq = 1) in iteration 1
- Note that ndeath = 1 means this will happen only if node 1 has already died once, which is the case here
Running Rabit Jobs
------------------
Rabit is a portable library that can run on multiple platforms.
All the rabit jobs can be submitted using [dmlc-tracker](https://github.com/dmlc/dmlc-core/tree/master/tracker)
Fault Tolerance
---------------
This section introduces how fault tolerance works in rabit.
The following figure shows how rabit deals with failures.
![](http://homes.cs.washington.edu/~tqchen/rabit/fig/fault-tol.png)
The scenario is as follows:
* Node 1 fails between the first and second call of Allreduce after the second checkpoint
* The other nodes wait in the call of the second Allreduce in order to help node 1 to recover.
* When node 1 restarts, it will call ```LoadCheckPoint```, and get the latest checkpoint from one of the existing nodes.
* Then node 1 can start from the latest checkpoint and continue running.
* When node 1 calls the first Allreduce again, as the other nodes already know the result, node 1 can get it from one of them.
* When node 1 reaches the second Allreduce, the other nodes find out that node 1 has caught up and they can continue the program normally.
This fault tolerance model is based on a key property of Allreduce and
Broadcast: All the nodes get the same result after calling Allreduce/Broadcast.
Because of this property, any node can record the results of past
Allreduce/Broadcast calls. When a node is recovered, it can fetch the lost
results from some alive nodes and rebuild its model.
The checkpoint is introduced so that we can discard the history results of
Allreduce/Broadcast calls before the latest checkpoint. This saves memory
consumption used for backup. The checkpoint of each node is a model defined by
users and can be split into 2 parts: a global model and a local model. The
global model is shared by all nodes and can be backed up by any nodes. The
local model of a node is replicated to some other nodes (selected using a ring
replication strategy). The checkpoint is only saved in the memory without
touching the disk which makes rabit programs more efficient. The strategy of
rabit is different from the fail-restart strategy where all the nodes restart
from the same checkpoint when any of them fail. In rabit, all the alive nodes
will block in the Allreduce call and help the recovery. To catch up, the
recovered node fetches its latest checkpoint and the results of
Allreduce/Broadcast calls after the checkpoint from some alive nodes.
This is just a conceptual introduction to rabit's fault tolerance model. The actual implementation is more sophisticated,
and can deal with more complicated cases such as multiple node failures and node failure during the recovery phase.
Rabit Timeout
---------------
In certain cases, a rabit cluster may lack the resources to retry failed workers.
Because the fault-tolerance model assumes infinite retries, this can cause the entire cluster to hang indefinitely.
We introduce a sidecar thread that runs when the rabit fault-tolerant runtime observes allreduce/broadcast errors.
By default, it waits for 30 minutes before all worker programs exit.
Users can opt in to this feature and change the threshold by passing rabit_timeout=true and rabit_timeout_sec=x (in seconds).

View File

@@ -1,24 +0,0 @@
Rabit Documentation
=====================
rabit is a lightweight library that provides a fault-tolerant interface for Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support **portable**, **scalable** and **reliable** distributed machine learning programs.
API Documents
-------------
```eval_rst
.. toctree::
:maxdepth: 2
python_api.md
cpp_api.md
parameters.md
guide.md
```
Indices and tables
------------------
```eval_rst
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
```


@ -1,21 +0,0 @@
Parameters
==========
This section lists all the parameters that can be passed to the ``rabit::Init`` function as argv.
All parameters are passed in as strings in the format ``parameter-name=parameter-value``.
In most settings these parameters have default values or are detected automatically,
and do not need to be configured manually.
* rabit_tracker_uri [passed in automatically by the tracker]
  - The URI/IP of the rabit tracker
* rabit_tracker_port [passed in automatically by the tracker]
  - The port of the rabit tracker
* rabit_task_id [automatically detected]
  - The unique identifier of the computing process
  - When running on Hadoop, this is extracted automatically from an environment variable
* rabit_reduce_buffer [default = 256MB]
  - The memory buffer used to store intermediate reduction results
  - Format: digits + unit, e.g. ``128M``, ``1G``
* rabit_global_replica [default = 5]
  - Number of replicas of the result kept for each Allreduce/Broadcast call
* rabit_local_replica [default = 2]
  - Number of replicas of the local model kept in the checkpoint
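As a sketch (the values here are only illustrative), any of these parameters can be overridden by appending extra ``name=value`` strings to the argv that is forwarded to ``rabit::Init``:

```cpp
#include <rabit/rabit.h>
#include <vector>

int main(int argc, char *argv[]) {
  // Forward the real command line plus two illustrative overrides.
  std::vector<char *> args(argv, argv + argc);
  char buffer[] = "rabit_reduce_buffer=1G";
  char replica[] = "rabit_global_replica=3";
  args.push_back(buffer);
  args.push_back(replica);
  rabit::Init(static_cast<int>(args.size()), args.data());
  // ... distributed computation ...
  rabit::Finalize();
  return 0;
}
```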


@ -1,3 +0,0 @@
numpy
breathe
commonmark


@ -1,11 +0,0 @@
Python API of Rabit
===================
This page contains the documentation of the Python API of rabit.
```eval_rst
.. toctree::
.. automodule:: rabit
:members:
:show-inheritance:
```


@ -1,16 +0,0 @@
# -*- coding: utf-8 -*-
"""Helper utilty function for customization."""
import sys
import os
import docutils
import subprocess
if os.environ.get('READTHEDOCS', None) == 'True':
subprocess.call('cd ..; rm -rf recommonmark;' +
'git clone https://github.com/tqchen/recommonmark', shell=True)
sys.path.insert(0, os.path.abspath('../recommonmark/'))
from recommonmark import parser, transform
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify


@ -1,26 +0,0 @@
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm -L../lib
export CFLAGS = -Wall -O3 -msse2 -std=c++11 -Wno-unknown-pragmas -fPIC -fopenmp -I../include
.PHONY: clean all lib libmpi
BIN = basic.rabit broadcast.rabit
MOCKBIN= lazy_allreduce.mock
all: $(BIN)
basic.rabit: basic.cc lib ../lib/librabit.a
broadcast.rabit: broadcast.cc lib ../lib/librabit.a
lazy_allreduce.mock: lazy_allreduce.cc lib ../lib/librabit.a
$(BIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -std=c++11 -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit_mock
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
clean:
$(RM) $(OBJ) $(BIN) $(MOCKBIN) *~ ../src/*~


@ -1 +0,0 @@
See tutorial at ../doc/guide.md


@ -1,35 +0,0 @@
/*!
* Copyright (c) 2014 by Contributors
* \file basic.cc
* \brief This is an example demonstrating what Allreduce is
*
* \author Tianqi Chen
*/
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <vector>
#include <rabit/rabit.h>
using namespace rabit;
int main(int argc, char *argv[]) {
int N = 3;
if (argc > 1) {
N = atoi(argv[1]);
}
std::vector<int> a(N);
rabit::Init(argc, argv);
for (int i = 0; i < N; ++i) {
a[i] = rabit::GetRank() + i;
}
printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
rabit::GetRank(), a[0], a[1], a[2]);
// allreduce takes the max of each element across all processes
Allreduce<op::Max>(&a[0], N);
printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
rabit::GetRank(), a[0], a[1], a[2]);
// second allreduce that sums everything up
Allreduce<op::Sum>(&a[0], N);
printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
rabit::GetRank(), a[0], a[1], a[2]);
rabit::Finalize();
return 0;
}


@ -1,27 +0,0 @@
#!/usr/bin/python
"""
demo python script of rabit
"""
from __future__ import print_function
from builtins import range
import os
import sys
import numpy as np
# import rabit, the tracker script will setup the lib path correctly
# for normal run without tracker script, add following line
# sys.path.append(os.path.dirname(__file__) + '/../python')
import rabit
rabit.init()
n = 3
rank = rabit.get_rank()
a = np.zeros(n)
for i in range(n):
a[i] = rank + i
print('@node[%d] before-allreduce: a=%s' % (rank, str(a)))
a = rabit.allreduce(a, rabit.MAX)
print('@node[%d] after-allreduce-max: a=%s' % (rank, str(a)))
a = rabit.allreduce(a, rabit.SUM)
print('@node[%d] after-allreduce-sum: a=%s' % (rank, str(a)))
rabit.finalize()


@ -1,16 +0,0 @@
#include <rabit/rabit.h>
using namespace rabit;
const int N = 3;
int main(int argc, char *argv[]) {
rabit::Init(argc, argv);
std::string s;
if (rabit::GetRank() == 0) s = "hello world";
printf("@node[%d] before-broadcast: s=\"%s\"\n",
rabit::GetRank(), s.c_str());
// broadcast s from node 0 to all other nodes
rabit::Broadcast(&s, 0);
printf("@node[%d] after-broadcast: s=\"%s\"\n",
rabit::GetRank(), s.c_str());
rabit::Finalize();
return 0;
}


@ -1,23 +0,0 @@
#!/usr/bin/python
"""
demo python script of rabit
"""
from __future__ import print_function
import os
import sys
# add path to wrapper
# for normal run without tracker script, add following line
# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
import rabit
rabit.init()
n = 3
rank = rabit.get_rank()
s = None
if rank == 0:
s = {'hello world':100, 2:3}
print('@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s)))
s = rabit.broadcast(s, 0)
print('@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s)))
rabit.finalize()


@ -1,34 +0,0 @@
/*!
* Copyright (c) 2014 by Contributors
* \file basic.cc
* \brief This is an example demonstrating what Allreduce is
*
* \author Tianqi Chen
*/
#include <rabit/rabit.h>
using namespace rabit;
const int N = 3;
int main(int argc, char *argv[]) {
int a[N] = {0};
rabit::Init(argc, argv);
// lazy preparation function
auto prepare = [&]() {
printf("@node[%d] run prepare function\n", rabit::GetRank());
for (int i = 0; i < N; ++i) {
a[i] = rabit::GetRank() + i;
}
};
printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
rabit::GetRank(), a[0], a[1], a[2]);
// allreduce takes the max of each element across all processes
Allreduce<op::Max>(&a[0], N, prepare);
printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
rabit::GetRank(), a[0], a[1], a[2]);
// run a second allreduce that sums everything up
Allreduce<op::Sum>(&a[0], N);
printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
rabit::GetRank(), a[0], a[1], a[2]);
rabit::Finalize();
return 0;
}


@ -1,31 +0,0 @@
#!/usr/bin/python
"""
demo python script of rabit: Lazy preparation function
"""
import os
import sys
import numpy as np
# import rabit, the tracker script will setup the lib path correctly
# for normal run without tracker script, add following line
# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
import rabit
# use mock library so that we can run failure test
rabit.init(lib = 'mock')
n = 3
rank = rabit.get_rank()
a = np.zeros(n)
def prepare(a):
print('@node[%d] run prepare function' % rank)
# must take the array by reference and modify it in place
for i in range(n):
a[i] = rank + i
print('@node[%d] before-allreduce: a=%s' % (rank, str(a)))
a = rabit.allreduce(a, rabit.MAX, prepare_fun = prepare)
print('@node[%d] after-allreduce-max: a=%s' % (rank, str(a)))
a = rabit.allreduce(a, rabit.SUM)
print('@node[%d] after-allreduce-sum: a=%s' % (rank, str(a)))
rabit.finalize()


@ -3,7 +3,11 @@
*/ */
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry #include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include "../common/threading_utils.h" #include "../common/threading_utils.h"
#if defined(XGBOOST_USE_CUDA)
#include "../data/device_adapter.cuh" #include "../data/device_adapter.cuh"
#elif defined(XGBOOST_USE_HIP)
#include "../data/device_adapter.hip.h"
#endif
#include "../data/proxy_dmatrix.h" #include "../data/proxy_dmatrix.h"
#include "c_api_error.h" #include "c_api_error.h"
#include "c_api_utils.h" #include "c_api_utils.h"


@ -2,6 +2,8 @@
* Copyright 2017-2023 XGBoost contributors * Copyright 2017-2023 XGBoost contributors
*/ */
#pragma once #pragma once
#if defined(XGBOOST_USE_CUDA)
#include <thrust/binary_search.h> // thrust::upper_bound #include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h> #include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h> #include <thrust/device_ptr.h>
@ -825,176 +827,6 @@ XGBOOST_DEVICE auto tcrend(xgboost::common::Span<T> const &span) { // NOLINT
return tcrbegin(span) + span.size(); return tcrbegin(span) + span.size();
} }
// This type sorts an array which is divided into multiple groups. The sorting is influenced
// by the function object 'Comparator'
template <typename T>
class SegmentSorter {
private:
// Items sorted within the group
caching_device_vector<T> ditems_;
// Original position of the items before they are sorted descending within their groups
caching_device_vector<uint32_t> doriginal_pos_;
// Segments within the original list that delineates the different groups
caching_device_vector<uint32_t> group_segments_;
// Need this on the device as it is used in the kernels
caching_device_vector<uint32_t> dgroups_; // Group information on device
// Where did the item that was originally present at position 'x' move to after they are sorted
caching_device_vector<uint32_t> dindexable_sorted_pos_;
// Initialize everything but the segments
void Init(uint32_t num_elems) {
ditems_.resize(num_elems);
doriginal_pos_.resize(num_elems);
thrust::sequence(doriginal_pos_.begin(), doriginal_pos_.end());
}
// Initialize all with group info
void Init(const std::vector<uint32_t> &groups) {
uint32_t num_elems = groups.back();
this->Init(num_elems);
this->CreateGroupSegments(groups);
}
public:
// This needs to be public due to device lambda
void CreateGroupSegments(const std::vector<uint32_t> &groups) {
uint32_t num_elems = groups.back();
group_segments_.resize(num_elems, 0);
dgroups_ = groups;
if (GetNumGroups() == 1) return; // There are no segments; hence, no need to compute them
// Define the segments by assigning a group ID to each element
const uint32_t *dgroups = dgroups_.data().get();
uint32_t ngroups = dgroups_.size();
auto ComputeGroupIDLambda = [=] __device__(uint32_t idx) {
return thrust::upper_bound(thrust::seq, dgroups, dgroups + ngroups, idx) -
dgroups - 1;
}; // NOLINT
thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
thrust::make_counting_iterator(num_elems),
group_segments_.begin(),
ComputeGroupIDLambda);
}
// Accessors that returns device pointer
inline uint32_t GetNumItems() const { return ditems_.size(); }
inline const xgboost::common::Span<const T> GetItemsSpan() const {
return { ditems_.data().get(), ditems_.size() };
}
inline const xgboost::common::Span<const uint32_t> GetOriginalPositionsSpan() const {
return { doriginal_pos_.data().get(), doriginal_pos_.size() };
}
inline const xgboost::common::Span<const uint32_t> GetGroupSegmentsSpan() const {
return { group_segments_.data().get(), group_segments_.size() };
}
inline uint32_t GetNumGroups() const { return dgroups_.size() - 1; }
inline const xgboost::common::Span<const uint32_t> GetGroupsSpan() const {
return { dgroups_.data().get(), dgroups_.size() };
}
inline const xgboost::common::Span<const uint32_t> GetIndexableSortedPositionsSpan() const {
return { dindexable_sorted_pos_.data().get(), dindexable_sorted_pos_.size() };
}
// Sort an array that is divided into multiple groups. The array is sorted within each group.
// This version provides the group information that is on the host.
// The array is sorted based on an adaptable binary predicate. By default a stateless predicate
// is used.
template <typename Comparator = thrust::greater<T>>
void SortItems(const T *ditems, uint32_t item_size, const std::vector<uint32_t> &groups,
const Comparator &comp = Comparator()) {
this->Init(groups);
this->SortItems(ditems, item_size, this->GetGroupSegmentsSpan(), comp);
}
// Sort an array that is divided into multiple groups. The array is sorted within each group.
// This version provides the group information that is on the device.
// The array is sorted based on an adaptable binary predicate. By default a stateless predicate
// is used.
template <typename Comparator = thrust::greater<T>>
void SortItems(const T *ditems, uint32_t item_size,
const xgboost::common::Span<const uint32_t> &group_segments,
const Comparator &comp = Comparator()) {
this->Init(item_size);
// Sort the items that are grouped. We would like to avoid using predicates to perform the sort,
// as thrust resorts to using a merge sort as opposed to a much much faster radix sort
// when comparators are used. Hence, the following algorithm is used. This is done so that
// we can grab the appropriate related values from the original list later, after the
// items are sorted.
//
// Here is the internal representation:
// dgroups_: [ 0, 3, 5, 8, 10 ]
// group_segments_: 0 0 0 | 1 1 | 2 2 2 | 3 3
// doriginal_pos_: 0 1 2 | 3 4 | 5 6 7 | 8 9
// ditems_: 1 0 1 | 2 1 | 1 3 3 | 4 4 (from original items)
//
// Sort the items first and make a note of the original positions in doriginal_pos_
// based on the sort
// ditems_: 4 4 3 3 2 1 1 1 1 0
// doriginal_pos_: 8 9 6 7 3 0 2 4 5 1
// NOTE: This consumes space, but is much faster than some of the other approaches - sorting
// in kernel, sorting using predicates etc.
ditems_.assign(thrust::device_ptr<const T>(ditems),
thrust::device_ptr<const T>(ditems) + item_size);
// Allocator to be used by sort for managing space overhead while sorting
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::stable_sort_by_key(thrust::cuda::par(alloc),
ditems_.begin(), ditems_.end(),
doriginal_pos_.begin(), comp);
if (GetNumGroups() == 1) return; // The entire array is sorted, as it isn't segmented
// Next, gather the segments based on the doriginal_pos_. This is to reflect the
// holistic item sort order on the segments
// group_segments_c_: 3 3 2 2 1 0 0 1 2 0
// doriginal_pos_: 8 9 6 7 3 0 2 4 5 1 (stays the same)
caching_device_vector<uint32_t> group_segments_c(item_size);
thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
dh::tcbegin(group_segments), group_segments_c.begin());
// Now, sort the group segments so that you may bring the items within the group together,
// in the process also noting the relative changes to the doriginal_pos_ while that happens
// group_segments_c_: 0 0 0 1 1 2 2 2 3 3
// doriginal_pos_: 0 2 1 3 4 6 7 5 8 9
thrust::stable_sort_by_key(thrust::cuda::par(alloc),
group_segments_c.begin(), group_segments_c.end(),
doriginal_pos_.begin(), thrust::less<uint32_t>());
// Finally, gather the original items based on doriginal_pos_ to sort the input and
// to store them in ditems_
// doriginal_pos_: 0 2 1 3 4 6 7 5 8 9 (stays the same)
// ditems_: 1 1 0 2 1 3 3 1 4 4 (from unsorted items - ditems)
thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
thrust::device_ptr<const T>(ditems), ditems_.begin());
}
// Determine where an item that was originally present at position 'x' has been relocated to
// after a sort. Creation of such an index has to be explicitly requested after a sort
void CreateIndexableSortedPositions() {
dindexable_sorted_pos_.resize(GetNumItems());
thrust::scatter(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
thrust::make_counting_iterator(GetNumItems()), // Rearrange indices...
// ...based on this map
dh::tcbegin(GetOriginalPositionsSpan()),
dindexable_sorted_pos_.begin()); // Write results into this
}
};
// Atomic add function for gradients // Atomic add function for gradients
template <typename OutputGradientT, typename InputGradientT> template <typename OutputGradientT, typename InputGradientT>
XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest, XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
@ -1382,3 +1214,7 @@ class LDGIterator {
} }
}; };
} // namespace dh } // namespace dh
#elif defined(XGBOOST_USE_HIP)
#include "device_helpers.hip.h"
#endif


@ -8,8 +8,7 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/span.h" // Span #include "xgboost/span.h" // Span
namespace xgboost { namespace xgboost::common {
namespace common {
struct OptionalWeights { struct OptionalWeights {
Span<float const> weights; Span<float const> weights;
float dft{1.0f}; // fixme: make this compile time constant float dft{1.0f}; // fixme: make this compile time constant
@ -18,7 +17,8 @@ struct OptionalWeights {
explicit OptionalWeights(float w) : dft{w} {} explicit OptionalWeights(float w) : dft{w} {}
XGBOOST_DEVICE float operator[](size_t i) const { return weights.empty() ? dft : weights[i]; } XGBOOST_DEVICE float operator[](size_t i) const { return weights.empty() ? dft : weights[i]; }
auto Empty() const { return weights.empty(); } [[nodiscard]] auto Empty() const { return weights.empty(); }
[[nodiscard]] auto Size() const { return weights.size(); }
}; };
inline OptionalWeights MakeOptionalWeights(Context const* ctx, inline OptionalWeights MakeOptionalWeights(Context const* ctx,
@ -28,6 +28,5 @@ inline OptionalWeights MakeOptionalWeights(Context const* ctx,
} }
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()}; return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost
#endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_ #endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_


@ -90,6 +90,9 @@ void HostSketchContainer::PushAdapterBatch(Batch const &batch, size_t base_rowid
MetaInfo const &info, float missing) { MetaInfo const &info, float missing) {
auto const &h_weights = auto const &h_weights =
(use_group_ind_ ? detail::UnrollGroupWeights(info) : info.weights_.HostVector()); (use_group_ind_ ? detail::UnrollGroupWeights(info) : info.weights_.HostVector());
if (!use_group_ind_ && !h_weights.empty()) {
CHECK_EQ(h_weights.size(), batch.Size()) << "Invalid size of sample weight.";
}
auto is_valid = data::IsValidFunctor{missing}; auto is_valid = data::IsValidFunctor{missing};
auto weights = OptionalWeights{Span<float const>{h_weights}}; auto weights = OptionalWeights{Span<float const>{h_weights}};


@ -19,12 +19,12 @@
#include "categorical.h" #include "categorical.h"
#include "common.h" #include "common.h"
#include "error_msg.h" // GroupWeight
#include "optional_weight.h" // OptionalWeights #include "optional_weight.h" // OptionalWeights
#include "threading_utils.h" #include "threading_utils.h"
#include "timer.h" #include "timer.h"
namespace xgboost { namespace xgboost::common {
namespace common {
/*! /*!
* \brief experimental wsummary * \brief experimental wsummary
* \tparam DType type of data content * \tparam DType type of data content
@ -695,13 +695,18 @@ inline std::vector<float> UnrollGroupWeights(MetaInfo const &info) {
return group_weights; return group_weights;
} }
size_t n_samples = info.num_row_;
auto const &group_ptr = info.group_ptr_; auto const &group_ptr = info.group_ptr_;
std::vector<float> results(n_samples);
CHECK_GE(group_ptr.size(), 2); CHECK_GE(group_ptr.size(), 2);
CHECK_EQ(group_ptr.back(), n_samples);
auto n_groups = group_ptr.size() - 1;
CHECK_EQ(info.weights_.Size(), n_groups) << error::GroupWeight();
bst_row_t n_samples = info.num_row_;
std::vector<float> results(n_samples);
CHECK_EQ(group_ptr.back(), n_samples)
<< error::GroupSize() << " the number of rows from the data.";
size_t cur_group = 0; size_t cur_group = 0;
for (size_t i = 0; i < n_samples; ++i) { for (bst_row_t i = 0; i < n_samples; ++i) {
results[i] = group_weights[cur_group]; results[i] = group_weights[cur_group];
if (i == group_ptr[cur_group + 1]) { if (i == group_ptr[cur_group + 1]) {
cur_group++; cur_group++;
@ -1010,6 +1015,5 @@ class SortedSketchContainer : public SketchContainerImpl<WXQuantileSketch<float,
*/ */
void PushColPage(SparsePage const &page, MetaInfo const &info, Span<float const> hessian); void PushColPage(SparsePage const &page, MetaInfo const &info, Span<float const> hessian);
}; };
} // namespace common } // namespace xgboost::common
} // namespace xgboost
#endif // XGBOOST_COMMON_QUANTILE_H_ #endif // XGBOOST_COMMON_QUANTILE_H_


@ -114,9 +114,20 @@ void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUS
DMLC_REGISTER_PARAMETER(LambdaRankParam); DMLC_REGISTER_PARAMETER(LambdaRankParam);
void PreCache::InitOnCPU(Context const*, MetaInfo const& info) {
auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);
CheckPreLabels("pre", h_label,
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA)
void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) { void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0); auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);
CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); }); CheckPreLabels("map", h_label,
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
} }
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)


@ -216,8 +216,13 @@ void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
[=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); }); [=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
} }
void PreCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckPreLabels("pre", d_label, CheckMAPOp{ctx->CUDACtx()});
}
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()}); CheckPreLabels("map", d_label, CheckMAPOp{ctx->CUDACtx()});
} }
} // namespace xgboost::ltr } // namespace xgboost::ltr


@ -366,18 +366,43 @@ bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
}); });
} }
/** /**
* \brief Validate label for MAP * \brief Validate label for precision-based metric.
* *
* \tparam Implementation of std::all_of. Specified as a parameter to reuse the check for * \tparam Implementation of std::all_of. Specified as a parameter to reuse the check for
* both CPU and GPU. * both CPU and GPU.
*/ */
template <typename AllOf> template <typename AllOf>
void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) { void CheckPreLabels(StringView name, linalg::VectorView<float const> label, AllOf all_of) {
auto s_label = label.Values(); auto s_label = label.Values();
auto is_binary = IsBinaryRel(label, all_of); auto is_binary = IsBinaryRel(label, all_of);
CHECK(is_binary) << "MAP can only be used with binary labels."; CHECK(is_binary) << name << " can only be used with binary labels.";
} }
class PreCache : public RankingCache {
HostDeviceVector<double> pre_;
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
public:
PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
}
common::Span<double> Pre(Context const* ctx) {
if (pre_.Empty()) {
pre_.SetDevice(ctx->gpu_id);
pre_.Resize(this->Groups());
}
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
}
};
class MAPCache : public RankingCache { class MAPCache : public RankingCache {
// Total number of relevant documents for each group // Total number of relevant documents for each group
HostDeviceVector<double> n_rel_; HostDeviceVector<double> n_rel_;


@ -7,14 +7,15 @@
#include <dmlc/registry.h> #include <dmlc/registry.h>
#include <array> #include <array>
#include <cstddef>
#include <cstring> #include <cstring>
#include "../collective/communicator-inl.h" #include "../collective/communicator-inl.h"
#include "../collective/communicator.h" #include "../collective/communicator.h"
#include "../common/common.h"
#include "../common/algorithm.h" // for StableSort #include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry #include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/error_msg.h" // for InfInData #include "../common/common.h"
#include "../common/error_msg.h" // for InfInData, GroupWeight, GroupSize
#include "../common/group_data.h" #include "../common/group_data.h"
#include "../common/io.h" #include "../common/io.h"
#include "../common/linalg_op.h" #include "../common/linalg_op.h"
@ -35,6 +36,7 @@
#include "xgboost/context.h" #include "xgboost/context.h"
#include "xgboost/host_device_vector.h" #include "xgboost/host_device_vector.h"
#include "xgboost/learner.h" #include "xgboost/learner.h"
#include "xgboost/linalg.h" // Vector
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "xgboost/string_view.h" #include "xgboost/string_view.h"
#include "xgboost/version_config.h" #include "xgboost/version_config.h"
@ -491,7 +493,7 @@ void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) {
} }
// uint info // uint info
if (key == "group") { if (key == "group") {
linalg::Tensor<bst_group_t, 1> t; linalg::Vector<bst_group_t> t;
CopyTensorInfoImpl(ctx, arr, &t); CopyTensorInfoImpl(ctx, arr, &t);
auto const& h_groups = t.Data()->HostVector(); auto const& h_groups = t.Data()->HostVector();
group_ptr_.clear(); group_ptr_.clear();
@ -516,6 +518,7 @@ void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) {
data::ValidateQueryGroup(group_ptr_); data::ValidateQueryGroup(group_ptr_);
return; return;
} }
// float info // float info
linalg::Tensor<float, 1> t; linalg::Tensor<float, 1> t;
CopyTensorInfoImpl<1>(ctx, arr, &t); CopyTensorInfoImpl<1>(ctx, arr, &t);
@ -717,58 +720,63 @@ void MetaInfo::SynchronizeNumberOfColumns() {
} }
} }
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
<< "Data is resided on a different device than `gpu_id`. "
<< "Device that data is on: " << v.DeviceIdx() << ", "
<< "`gpu_id` for XGBoost: " << device;
}
template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data());
}
} // anonymous namespace
void MetaInfo::Validate(std::int32_t device) const { void MetaInfo::Validate(std::int32_t device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) { if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
<< "Size of weights must equal to number of groups when ranking "
"group is used.";
return; return;
} }
if (group_ptr_.size() != 0) { if (group_ptr_.size() != 0) {
CHECK_EQ(group_ptr_.back(), num_row_) CHECK_EQ(group_ptr_.back(), num_row_)
<< "Invalid group structure. Number of rows obtained from groups " << error::GroupSize() << "the actual number of rows given by data.";
"doesn't equal to actual number of rows given by data.";
} }
auto check_device = [device](HostDeviceVector<float> const& v) {
CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
<< "Data is resided on a different device than `gpu_id`. "
<< "Device that data is on: " << v.DeviceIdx() << ", "
<< "`gpu_id` for XGBoost: " << device;
};
if (weights_.Size() != 0) { if (weights_.Size() != 0) {
CHECK_EQ(weights_.Size(), num_row_) CHECK_EQ(weights_.Size(), num_row_)
<< "Size of weights must equal to number of rows."; << "Size of weights must equal to number of rows.";
check_device(weights_); CheckDevice(device, weights_);
return; return;
} }
if (labels.Size() != 0) { if (labels.Size() != 0) {
CHECK_EQ(labels.Shape(0), num_row_) << "Size of labels must equal to number of rows."; CHECK_EQ(labels.Shape(0), num_row_) << "Size of labels must equal to number of rows.";
check_device(*labels.Data()); CheckDevice(device, labels);
return; return;
} }
if (labels_lower_bound_.Size() != 0) { if (labels_lower_bound_.Size() != 0) {
CHECK_EQ(labels_lower_bound_.Size(), num_row_) CHECK_EQ(labels_lower_bound_.Size(), num_row_)
<< "Size of label_lower_bound must equal to number of rows."; << "Size of label_lower_bound must equal to number of rows.";
check_device(labels_lower_bound_); CheckDevice(device, labels_lower_bound_);
return; return;
} }
if (feature_weights.Size() != 0) { if (feature_weights.Size() != 0) {
CHECK_EQ(feature_weights.Size(), num_col_) CHECK_EQ(feature_weights.Size(), num_col_)
<< "Size of feature_weights must equal to number of columns."; << "Size of feature_weights must equal to number of columns.";
check_device(feature_weights); CheckDevice(device, feature_weights);
} }
if (labels_upper_bound_.Size() != 0) { if (labels_upper_bound_.Size() != 0) {
CHECK_EQ(labels_upper_bound_.Size(), num_row_) CHECK_EQ(labels_upper_bound_.Size(), num_row_)
<< "Size of label_upper_bound must equal to number of rows."; << "Size of label_upper_bound must equal to number of rows.";
check_device(labels_upper_bound_); CheckDevice(device, labels_upper_bound_);
return; return;
} }
CHECK_LE(num_nonzero_, num_col_ * num_row_); CHECK_LE(num_nonzero_, num_col_ * num_row_);
if (base_margin_.Size() != 0) { if (base_margin_.Size() != 0) {
CHECK_EQ(base_margin_.Size() % num_row_, 0) CHECK_EQ(base_margin_.Size() % num_row_, 0)
<< "Size of base margin must be a multiple of number of rows."; << "Size of base margin must be a multiple of number of rows.";
check_device(*base_margin_.Data()); CheckDevice(device, base_margin_);
} }
} }
@ -1028,6 +1036,8 @@ SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
bool SparsePage::IsIndicesSorted(int32_t n_threads) const { bool SparsePage::IsIndicesSorted(int32_t n_threads) const {
auto& h_offset = this->offset.HostVector(); auto& h_offset = this->offset.HostVector();
auto& h_data = this->data.HostVector(); auto& h_data = this->data.HostVector();
n_threads = std::max(std::min(static_cast<std::size_t>(n_threads), this->Size()),
static_cast<std::size_t>(1));
std::vector<int32_t> is_sorted_tloc(n_threads, 0); std::vector<int32_t> is_sorted_tloc(n_threads, 0);
common::ParallelFor(this->Size(), n_threads, [&](auto i) { common::ParallelFor(this->Size(), n_threads, [&](auto i) {
auto beg = h_offset[i]; auto beg = h_offset[i];


@ -366,8 +366,8 @@ inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, Da
common::AssertGPUSupport(); common::AssertGPUSupport();
} }
inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx, inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const*,
BatchParam const& param) { BatchParam const&) {
common::AssertGPUSupport(); common::AssertGPUSupport();
auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_)); auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter)); return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));


@ -52,32 +52,13 @@ Metric::Create(const std::string& name, Context const* ctx) {
metric->ctx_ = ctx; metric->ctx_ = ctx;
return metric; return metric;
} }
GPUMetric* GPUMetric::CreateGPUMetric(const std::string& name, Context const* ctx) {
auto metric = CreateMetricImpl<MetricGPUReg>(name);
if (metric == nullptr) {
LOG(WARNING) << "Cannot find a GPU metric builder for metric " << name
<< ". Resorting to the CPU builder";
return nullptr;
}
// Narrowing reference only for the compiler to allow assignment to a base class member.
// As such, using this narrowed reference to refer to derived members will be an illegal op.
// This is moot, as this type is stateless.
auto casted = static_cast<GPUMetric*>(metric);
CHECK(casted);
casted->ctx_ = ctx;
return casted;
}
} // namespace xgboost } // namespace xgboost
namespace dmlc { namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg); DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
DMLC_REGISTRY_ENABLE(::xgboost::MetricGPUReg);
} }
namespace xgboost { namespace xgboost::metric {
namespace metric {
// List of files that will be force linked in static links. // List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(auc); DMLC_REGISTRY_LINK_TAG(auc);
DMLC_REGISTRY_LINK_TAG(elementwise_metric); DMLC_REGISTRY_LINK_TAG(elementwise_metric);
@ -88,5 +69,4 @@ DMLC_REGISTRY_LINK_TAG(rank_metric);
DMLC_REGISTRY_LINK_TAG(auc_gpu); DMLC_REGISTRY_LINK_TAG(auc_gpu);
DMLC_REGISTRY_LINK_TAG(rank_metric_gpu); DMLC_REGISTRY_LINK_TAG(rank_metric_gpu);
#endif #endif
} // namespace metric } // namespace xgboost::metric
} // namespace xgboost


@ -23,53 +23,14 @@ class MetricNoCache : public Metric {
double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final { double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
double result{0.0}; double result{0.0};
auto const& info = p_fmat->Info(); auto const &info = p_fmat->Info();
collective::ApplyWithLabels(info, &result, sizeof(double), [&] { collective::ApplyWithLabels(info, &result, sizeof(double),
result = this->Eval(predts, info); [&] { result = this->Eval(predts, info); });
});
return result; return result;
} }
}; };
// This creates a GPU metric instance dynamically and adds it to the GPU metric registry, if not
// present already. This is created when there is a device ordinal present and if xgboost
// is compiled with CUDA support
struct GPUMetric : public MetricNoCache {
static GPUMetric *CreateGPUMetric(const std::string &name, Context const *tparam);
};
/*!
* \brief Internal registry entries for GPU Metric factory functions.
* The additional parameter const char* param gives the value after @, can be null.
* For example, metric map@3, then: param == "3".
*/
struct MetricGPUReg
: public dmlc::FunctionRegEntryBase<MetricGPUReg,
std::function<Metric * (const char*)> > {
};
/*!
* \brief Macro to register metric computed on GPU.
*
* \code
* // example of registering a objective ndcg@k
* XGBOOST_REGISTER_GPU_METRIC(NDCG_GPU, "ndcg")
* .describe("NDCG metric computer on GPU.")
* .set_body([](const char* param) {
* int at_k = atoi(param);
* return new NDCG(at_k);
* });
* \endcode
*/
// Note: Metric names registered in the GPU registry should follow this convention:
// - GPU metric types should be registered with the same name as the non GPU metric types
#define XGBOOST_REGISTER_GPU_METRIC(UniqueId, Name) \
::xgboost::MetricGPUReg& __make_ ## MetricGPUReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::MetricGPUReg>::Get()->__REGISTER__(Name)
namespace metric { namespace metric {
// Ranking config to be used on device and host // Ranking config to be used on device and host
struct EvalRankConfig { struct EvalRankConfig {
public: public:
@ -81,8 +42,8 @@ struct EvalRankConfig {
}; };
class PackedReduceResult { class PackedReduceResult {
double residue_sum_ { 0 }; double residue_sum_{0};
double weights_sum_ { 0 }; double weights_sum_{0};
public: public:
XGBOOST_DEVICE PackedReduceResult() {} // NOLINT XGBOOST_DEVICE PackedReduceResult() {} // NOLINT
@ -91,16 +52,15 @@ class PackedReduceResult {
XGBOOST_DEVICE XGBOOST_DEVICE
PackedReduceResult operator+(PackedReduceResult const &other) const { PackedReduceResult operator+(PackedReduceResult const &other) const {
return PackedReduceResult{residue_sum_ + other.residue_sum_, return PackedReduceResult{residue_sum_ + other.residue_sum_, weights_sum_ + other.weights_sum_};
weights_sum_ + other.weights_sum_};
} }
PackedReduceResult &operator+=(PackedReduceResult const &other) { PackedReduceResult &operator+=(PackedReduceResult const &other) {
this->residue_sum_ += other.residue_sum_; this->residue_sum_ += other.residue_sum_;
this->weights_sum_ += other.weights_sum_; this->weights_sum_ += other.weights_sum_;
return *this; return *this;
} }
double Residue() const { return residue_sum_; } [[nodiscard]] double Residue() const { return residue_sum_; }
double Weights() const { return weights_sum_; } [[nodiscard]] double Weights() const { return weights_sum_; }
}; };
} // namespace metric } // namespace metric


@ -1,25 +1,6 @@
/** /**
* Copyright 2020-2023 by XGBoost contributors * Copyright 2020-2023 by XGBoost contributors
*/ */
// When device ordinal is present, we would want to build the metrics on the GPU. It is *not*
// possible for a valid device ordinal to be present for non GPU builds. However, it is possible
// for an invalid device ordinal to be specified in GPU builds - to train/predict and/or compute
// the metrics on CPU. To accommodate these scenarios, the following is done for the metrics
// accelerated on the GPU.
// - An internal GPU registry holds all the GPU metric types (defined in the .cu file)
// - An instance of the appropriate GPU metric type is created when a device ordinal is present
// - If the creation is successful, the metric computation is done on the device
// - else, it falls back on the CPU
// - The GPU metric types are *only* registered when xgboost is built for GPUs
//
// This is done for 2 reasons:
// - Clear separation of CPU and GPU logic
// - Sorting datasets containing large number of rows is (much) faster when parallel sort
// semantics is used on the CPU. The __gnu_parallel/concurrency primitives needed to perform
// this cannot be used when the translation unit is compiled using the 'nvcc' compiler (as the
// corresponding headers that brings in those function declaration can't be included with CUDA).
// This precludes the CPU and GPU logic to coexist inside a .cu file
#include "rank_metric.h" #include "rank_metric.h"
#include <dmlc/omp.h> #include <dmlc/omp.h>
@ -57,55 +38,8 @@
#include "xgboost/string_view.h" // for StringView #include "xgboost/string_view.h" // for StringView
namespace { namespace {
using PredIndPair = std::pair<xgboost::bst_float, xgboost::ltr::rel_degree_t>; using PredIndPair = std::pair<xgboost::bst_float, xgboost::ltr::rel_degree_t>;
using PredIndPairContainer = std::vector<PredIndPair>; using PredIndPairContainer = std::vector<PredIndPair>;
/*
* Adapter to access instance weights.
*
* - For ranking task, weights are per-group
* - For binary classification task, weights are per-instance
*
* WeightPolicy::GetWeightOfInstance() :
* get weight associated with an individual instance, using index into
* `info.weights`
* WeightPolicy::GetWeightOfSortedRecord() :
* get weight associated with an individual instance, using index into
* sorted records `rec` (in ascending order of predicted labels). `rec` is
* of type PredIndPairContainer
*/
class PerInstanceWeightPolicy {
public:
inline static xgboost::bst_float
GetWeightOfInstance(const xgboost::MetaInfo& info,
unsigned instance_id, unsigned) {
return info.GetWeight(instance_id);
}
inline static xgboost::bst_float
GetWeightOfSortedRecord(const xgboost::MetaInfo& info,
const PredIndPairContainer& rec,
unsigned record_id, unsigned) {
return info.GetWeight(rec[record_id].second);
}
};
class PerGroupWeightPolicy {
public:
inline static xgboost::bst_float
GetWeightOfInstance(const xgboost::MetaInfo& info,
unsigned, unsigned group_id) {
return info.GetWeight(group_id);
}
inline static xgboost::bst_float
GetWeightOfSortedRecord(const xgboost::MetaInfo& info,
const PredIndPairContainer&,
unsigned, unsigned group_id) {
return info.GetWeight(group_id);
}
};
} // anonymous namespace } // anonymous namespace
namespace xgboost::metric { namespace xgboost::metric {
@ -177,10 +111,6 @@ struct EvalAMS : public MetricNoCache {
/*! \brief Evaluate rank list */ /*! \brief Evaluate rank list */
struct EvalRank : public MetricNoCache, public EvalRankConfig { struct EvalRank : public MetricNoCache, public EvalRankConfig {
private:
// This is used to compute the ranking metrics on the GPU - for training jobs that run on the GPU.
std::unique_ptr<MetricNoCache> rank_gpu_;
public: public:
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override { double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK_EQ(preds.Size(), info.labels.Size()) CHECK_EQ(preds.Size(), info.labels.Size())
@ -199,20 +129,10 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
// sum statistics // sum statistics
double sum_metric = 0.0f; double sum_metric = 0.0f;
// Check and see if we have the GPU metric registered in the internal registry
if (ctx_->gpu_id >= 0) {
if (!rank_gpu_) {
rank_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), ctx_));
}
if (rank_gpu_) {
sum_metric = rank_gpu_->Eval(preds, info);
}
}
CHECK(ctx_); CHECK(ctx_);
std::vector<double> sum_tloc(ctx_->Threads(), 0.0); std::vector<double> sum_tloc(ctx_->Threads(), 0.0);
if (!rank_gpu_ || ctx_->gpu_id < 0) { {
const auto& labels = info.labels.View(Context::kCpuId); const auto& labels = info.labels.View(Context::kCpuId);
const auto &h_preds = preds.ConstHostVector(); const auto &h_preds = preds.ConstHostVector();
@ -253,23 +173,6 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
virtual double EvalGroup(PredIndPairContainer *recptr) const = 0; virtual double EvalGroup(PredIndPairContainer *recptr) const = 0;
}; };
/*! \brief Precision at N, for both classification and rank */
struct EvalPrecision : public EvalRank {
public:
explicit EvalPrecision(const char* name, const char* param) : EvalRank(name, param) {}
double EvalGroup(PredIndPairContainer *recptr) const override {
PredIndPairContainer &rec(*recptr);
// calculate Precision
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
unsigned nhit = 0;
for (size_t j = 0; j < rec.size() && j < this->topn; ++j) {
nhit += (rec[j].second != 0);
}
return static_cast<double>(nhit) / this->topn;
}
};
/*! \brief Cox: Partial likelihood of the Cox proportional hazards model */ /*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
struct EvalCox : public MetricNoCache { struct EvalCox : public MetricNoCache {
public: public:
@ -312,7 +215,7 @@ struct EvalCox : public MetricNoCache {
return out/num_events; // normalize by the number of events return out/num_events; // normalize by the number of events
} }
const char* Name() const override { [[nodiscard]] const char* Name() const override {
return "cox-nloglik"; return "cox-nloglik";
} }
}; };
@ -321,10 +224,6 @@ XGBOOST_REGISTER_METRIC(AMS, "ams")
.describe("AMS metric for higgs.") .describe("AMS metric for higgs.")
.set_body([](const char* param) { return new EvalAMS(param); }); .set_body([](const char* param) { return new EvalAMS(param); });
XGBOOST_REGISTER_METRIC(Precision, "pre")
.describe("precision@k for rank.")
.set_body([](const char* param) { return new EvalPrecision("pre", param); });
XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik") XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
.describe("Negative log partial likelihood of Cox proportional hazards model.") .describe("Negative log partial likelihood of Cox proportional hazards model.")
.set_body([](const char*) { return new EvalCox(); }); .set_body([](const char*) { return new EvalCox(); });
@ -387,6 +286,8 @@ class EvalRankWithCache : public Metric {
return result; return result;
} }
[[nodiscard]] const char* Name() const override { return name_.c_str(); }
virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info, virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<Cache> p_cache) = 0; std::shared_ptr<Cache> p_cache) = 0;
}; };
@ -408,6 +309,52 @@ double Finalize(MetaInfo const& info, double score, double sw) {
} }
} // namespace } // namespace
class EvalPrecision : public EvalRankWithCache<ltr::PreCache> {
public:
using EvalRankWithCache::EvalRankWithCache;
double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
std::shared_ptr<ltr::PreCache> p_cache) final {
auto n_groups = p_cache->Groups();
if (!info.weights_.Empty()) {
CHECK_EQ(info.weights_.Size(), n_groups) << error::GroupWeight();
}
if (ctx_->IsCUDA()) {
auto pre = cuda_impl::PreScore(ctx_, info, predt, p_cache);
return Finalize(info, pre.Residue(), pre.Weights());
}
auto gptr = p_cache->DataGroupPtr(ctx_);
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = linalg::MakeTensorView(ctx_, &predt, predt.Size());
auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
auto weight = common::MakeOptionalWeights(ctx_, info.weights_);
auto pre = p_cache->Pre(ctx_);
common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
auto g_rank = rank_idx.subspan(gptr[g], gptr[g + 1] - gptr[g]);
auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size());
double n_hits{0.0};
for (std::size_t i = 0; i < n; ++i) {
n_hits += g_label(g_rank[i]) * weight[g];
}
pre[g] = n_hits / static_cast<double>(n);
});
auto sw = 0.0;
for (std::size_t i = 0; i < pre.size(); ++i) {
sw += weight[i];
}
auto sum = std::accumulate(pre.cbegin(), pre.cend(), 0.0);
return Finalize(info, sum, sw);
}
};
/** /**
* \brief Implement the NDCG score function for learning to rank. * \brief Implement the NDCG score function for learning to rank.
* *
@ -416,7 +363,6 @@ double Finalize(MetaInfo const& info, double score, double sw) {
class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> { class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
public: public:
using EvalRankWithCache::EvalRankWithCache; using EvalRankWithCache::EvalRankWithCache;
const char* Name() const override { return name_.c_str(); }
double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info, double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<ltr::NDCGCache> p_cache) override { std::shared_ptr<ltr::NDCGCache> p_cache) override {
@ -475,7 +421,6 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> { class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
public: public:
using EvalRankWithCache::EvalRankWithCache; using EvalRankWithCache::EvalRankWithCache;
const char* Name() const override { return name_.c_str(); }
double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info, double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
std::shared_ptr<ltr::MAPCache> p_cache) override { std::shared_ptr<ltr::MAPCache> p_cache) override {
@ -494,7 +439,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) { common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1])); auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
auto g_rank = rank_idx.subspan(gptr[g]); auto g_rank = rank_idx.subspan(gptr[g], gptr[g + 1] - gptr[g]);
auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size()); auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size());
double n_hits{0.0}; double n_hits{0.0};
@ -527,6 +472,10 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
} }
}; };
XGBOOST_REGISTER_METRIC(Precision, "pre")
.describe("precision@k for rank.")
.set_body([](const char* param) { return new EvalPrecision("pre", param); });
XGBOOST_REGISTER_METRIC(EvalMAP, "map") XGBOOST_REGISTER_METRIC(EvalMAP, "map")
.describe("map@k for ranking.") .describe("map@k for ranking.")
.set_body([](char const* param) { .set_body([](char const* param) {


@ -34,124 +34,57 @@ namespace xgboost::metric {
// tag the this file, used by force static link later. // tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(rank_metric_gpu); DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);
/*! \brief Evaluate rank list on GPU */
template <typename EvalMetricT>
struct EvalRankGpu : public GPUMetric, public EvalRankConfig {
public:
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
// Sanity check is done by the caller
std::vector<unsigned> tgptr(2, 0);
tgptr[1] = static_cast<unsigned>(preds.Size());
const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
const auto ngroups = static_cast<bst_omp_uint>(gptr.size() - 1);
auto device = ctx_->gpu_id;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
info.labels.SetDevice(device);
preds.SetDevice(device);
auto dpreds = preds.ConstDevicePointer();
auto dlabels = info.labels.View(device);
// Sort all the predictions
dh::SegmentSorter<float> segment_pred_sorter;
segment_pred_sorter.SortItems(dpreds, preds.Size(), gptr);
// Compute individual group metric and sum them up
return EvalMetricT::EvalMetric(segment_pred_sorter, dlabels.Values().data(), *this);
}
const char* Name() const override {
return name.c_str();
}
explicit EvalRankGpu(const char* name, const char* param) {
using namespace std; // NOLINT(*)
if (param != nullptr) {
std::ostringstream os;
if (sscanf(param, "%u[-]?", &this->topn) == 1) {
os << name << '@' << param;
this->name = os.str();
} else {
os << name << param;
this->name = os.str();
}
if (param[strlen(param) - 1] == '-') {
this->minus = true;
}
} else {
this->name = name;
}
}
};
/*! \brief Precision at N, for both classification and rank */
struct EvalPrecisionGpu {
public:
static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg) {
// Group info on device
const auto &dgroups = pred_sorter.GetGroupsSpan();
const auto ngroups = pred_sorter.GetNumGroups();
const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
// Original positions of the predictions after they have been sorted
const auto &dpreds_orig_pos = pred_sorter.GetOriginalPositionsSpan();
// First, determine non zero labels in the dataset individually
auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
return (static_cast<unsigned>(dlabels[dpreds_orig_pos[idx]]) != 0) ? 1 : 0;
}; // NOLINT
// Find each group's metric sum
dh::caching_device_vector<uint32_t> hits(ngroups, 0);
const auto nitems = pred_sorter.GetNumItems();
auto *dhits = hits.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// For each group item compute the aggregated precision
dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
const auto group_idx = dgroup_idx[idx];
const auto group_begin = dgroups[group_idx];
const auto ridx = idx - group_begin;
if (ridx < ecfg.topn && DetermineNonTrivialLabelLambda(idx)) {
atomicAdd(&dhits[group_idx], 1);
}
});
// Allocator to be used for managing space overhead while performing reductions
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_CUDA)
return static_cast<double>(thrust::reduce(thrust::cuda::par(alloc),
hits.begin(), hits.end())) / ecfg.topn;
#elif defined(XGBOOST_USE_HIP)
return static_cast<double>(thrust::reduce(thrust::hip::par(alloc),
hits.begin(), hits.end())) / ecfg.topn;
#endif
}
};
XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre")
.describe("precision@k for rank computed on GPU.")
.set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); });
namespace cuda_impl { namespace cuda_impl {
PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt,
std::shared_ptr<ltr::PreCache> p_cache) {
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto topk = p_cache->Param().TopK();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
auto it = dh::MakeTransformIterator<double>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
auto g = dh::SegmentId(d_gptr, i);
auto g_begin = d_gptr[g];
auto g_end = d_gptr[g + 1];
i -= g_begin;
auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
double y = g_label(g_rank[i]);
auto n = std::min(static_cast<std::size_t>(topk), g_label.Size());
double w{d_weight[g]};
if (i >= n) {
return 0.0;
}
return y / static_cast<double>(n) * w;
});
auto cuctx = ctx->CUDACtx();
auto pre = p_cache->Pre(ctx);
thrust::fill_n(cuctx->CTP(), pre.data(), pre.size(), 0.0);
std::size_t bytes;
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, it, pre.data(), p_cache->Groups(), d_gptr.data(),
d_gptr.data() + 1, cuctx->Stream());
dh::TemporaryArray<char> temp(bytes);
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, it, pre.data(), p_cache->Groups(),
d_gptr.data(), d_gptr.data() + 1, cuctx->Stream());
auto w_it =
dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t g) { return d_weight[g]; });
auto n_weights = p_cache->Groups();
auto sw = dh::Reduce(cuctx->CTP(), w_it, w_it + n_weights, 0.0, thrust::plus<double>{});
auto sum =
dh::Reduce(cuctx->CTP(), dh::tcbegin(pre), dh::tcend(pre), 0.0, thrust::plus<double>{});
auto result = PackedReduceResult{sum, sw};
return result;
}
PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus, HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::NDCGCache> p_cache) { std::shared_ptr<ltr::NDCGCache> p_cache) {
@ -174,6 +107,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(),
d_out_dcg); d_out_dcg);
auto it = dh::MakeTransformIterator<PackedReduceResult>( auto it = dh::MakeTransformIterator<PackedReduceResult>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
if (d_inv_idcg(i) <= 0.0) { if (d_inv_idcg(i) <= 0.0) {


@ -3,7 +3,7 @@
/** /**
* Copyright 2023 by XGBoost Contributors * Copyright 2023 by XGBoost Contributors
*/ */
#include <memory> // for shared_ptr #include <memory> // for shared_ptr
#include "../common/common.h" // for AssertGPUSupport #include "../common/common.h" // for AssertGPUSupport
#include "../common/ranking_utils.h" // for NDCGCache, MAPCache #include "../common/ranking_utils.h" // for NDCGCache, MAPCache
@ -12,9 +12,7 @@
#include "xgboost/data.h" // for MetaInfo #include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost { namespace xgboost::metric::cuda_impl {
namespace metric {
namespace cuda_impl {
PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus, HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::NDCGCache> p_cache); std::shared_ptr<ltr::NDCGCache> p_cache);
@ -23,6 +21,10 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus, HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache); std::shared_ptr<ltr::MAPCache> p_cache);
PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt,
std::shared_ptr<ltr::PreCache> p_cache);
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
                                    HostDeviceVector<float> const &, bool,
@@ -37,8 +39,13 @@ inline PackedReduceResult MAPScore(Context const *, MetaInfo const &,
  common::AssertGPUSupport();
  return {};
}
inline PackedReduceResult PreScore(Context const *, MetaInfo const &,
HostDeviceVector<float> const &,
std::shared_ptr<ltr::PreCache>) {
common::AssertGPUSupport();
return {};
}
#endif
-}  // namespace cuda_impl
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric::cuda_impl

#endif  // XGBOOST_METRIC_RANK_METRIC_H_
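Editor's note: this header pairs each GPU entry point with an inline CPU-only stub. When neither XGBOOST_USE_CUDA nor XGBOOST_USE_HIP is defined, the stub keeps the build linking and turns a call into a clear runtime failure via common::AssertGPUSupport(). A minimal sketch of the same idiom with a hypothetical name (GpuOnlyOp is illustrative, not an XGBoost symbol):

#include <cstdlib>
#include <iostream>

// GPU builds provide the real definition in a .cu/.hip translation unit.
double GpuOnlyOp(int n);

#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
// CPU-only build: same signature, but fail fast with a diagnosis at the call
// site instead of producing an undefined-symbol error at link time.
inline double GpuOnlyOp(int) {
  std::cerr << "This build was not compiled with GPU support." << std::endl;
  std::abort();
}
#endif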
View File
@@ -191,6 +191,15 @@ struct SparsePageView {
  size_t Size() const { return view.Size(); }
};
struct SingleInstanceView {
bst_row_t base_rowid{};
SparsePage::Inst const &inst;
explicit SingleInstanceView(SparsePage::Inst const &instance) : inst{instance} {}
SparsePage::Inst operator[](size_t) { return inst; }
static size_t Size() { return 1; }
};
struct GHistIndexMatrixView {
 private:
  GHistIndexMatrix const &page_;
@@ -409,6 +418,24 @@ class ColumnSplitHelper {
    }
  }
void PredictInstance(SparsePage::Inst const &inst, std::vector<bst_float> *out_preds) {
CHECK(xgboost::collective::IsDistributed())
<< "column-split prediction is only supported for distributed training";
PredictBatchKernel<SingleInstanceView, 1>(SingleInstanceView{inst}, out_preds);
}
void PredictLeaf(DMatrix *p_fmat, std::vector<bst_float> *out_preds) {
CHECK(xgboost::collective::IsDistributed())
<< "column-split prediction is only supported for distributed training";
for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
CHECK_EQ(out_preds->size(),
p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group);
PredictBatchKernel<SparsePageView, kBlockOfRowsSize, true>(SparsePageView{&batch}, out_preds);
}
}
 private:
  using BitVector = RBitField8;
@@ -498,24 +525,31 @@ class ColumnSplitHelper {
    return nid;
  }
template <bool predict_leaf = false>
bst_float PredictOneTree(std::size_t tree_id, std::size_t row_id) {
  auto const &tree = *model_.trees[tree_id];
  auto const leaf = GetLeafIndex(tree, tree_id, row_id);
-  return tree[leaf].LeafValue();
+  if constexpr (predict_leaf) {
+    return static_cast<bst_float>(leaf);
+  } else {
+    return tree[leaf].LeafValue();
+  }
}
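Editor's note: the if constexpr (predict_leaf) branch is resolved at compile time, so each instantiation of PredictOneTree carries no runtime check; the leaf-index variant and the leaf-value variant compile to separate functions. A minimal standalone sketch of this compile-time dispatch, with hypothetical names:

#include <iostream>

// The bool template parameter selects the returned quantity with zero
// runtime overhead: the untaken branch is discarded per instantiation.
template <bool want_index = false>
double PickResult(int index, double value) {
  if constexpr (want_index) {
    return static_cast<double>(index);  // compiled only into PickResult<true>
  } else {
    return value;                       // compiled only into PickResult<false>
  }
}

int main() {
  std::cout << PickResult<true>(7, 1.5) << "\n";   // 7
  std::cout << PickResult<false>(7, 1.5) << "\n";  // 1.5
}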
template <bool predict_leaf = false>
void PredictAllTrees(std::vector<bst_float> *out_preds, std::size_t batch_offset,
                     std::size_t predict_offset, std::size_t num_group, std::size_t block_size) {
  auto &preds = *out_preds;
  for (size_t tree_id = tree_begin_; tree_id < tree_end_; ++tree_id) {
    auto const gid = model_.tree_info[tree_id];
    for (size_t i = 0; i < block_size; ++i) {
-      preds[(predict_offset + i) * num_group + gid] += PredictOneTree(tree_id, batch_offset + i);
+      preds[(predict_offset + i) * num_group + gid] +=
+          PredictOneTree<predict_leaf>(tree_id, batch_offset + i);
    }
  }
}
-template <typename DataView, size_t block_of_rows_size>
+template <typename DataView, size_t block_of_rows_size, bool predict_leaf = false>
void PredictBatchKernel(DataView batch, std::vector<bst_float> *out_preds) {
  auto const num_group = model_.learner_model_param->num_output_group;
@@ -544,8 +578,8 @@ class ColumnSplitHelper {
    auto const batch_offset = block_id * block_of_rows_size;
    auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
                                     static_cast<std::size_t>(block_of_rows_size));
-    PredictAllTrees(out_preds, batch_offset, batch_offset + batch.base_rowid, num_group,
-                    block_size);
+    PredictAllTrees<predict_leaf>(out_preds, batch_offset, batch_offset + batch.base_rowid,
+                                  num_group, block_size);
  });

  ClearBitVectors();
@@ -728,18 +762,25 @@ class CPUPredictor : public Predictor {
    return true;
  }
-void PredictInstance(const SparsePage::Inst& inst,
-                     std::vector<bst_float>* out_preds,
-                     const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
+void PredictInstance(const SparsePage::Inst &inst, std::vector<bst_float> *out_preds,
+                     const gbm::GBTreeModel &model, unsigned ntree_limit,
+                     bool is_column_split) const override {
  CHECK(!model.learner_model_param->IsVectorLeaf()) << "predict instance" << MTNotImplemented();
-  std::vector<RegTree::FVec> feat_vecs;
-  feat_vecs.resize(1, RegTree::FVec());
-  feat_vecs[0].Init(model.learner_model_param->num_feature);
  ntree_limit *= model.learner_model_param->num_output_group;
  if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
    ntree_limit = static_cast<unsigned>(model.trees.size());
  }
  out_preds->resize(model.learner_model_param->num_output_group);
if (is_column_split) {
ColumnSplitHelper helper(this->ctx_->Threads(), model, 0, ntree_limit);
helper.PredictInstance(inst, out_preds);
return;
}
std::vector<RegTree::FVec> feat_vecs;
feat_vecs.resize(1, RegTree::FVec());
feat_vecs[0].Init(model.learner_model_param->num_feature);
  auto base_score = model.learner_model_param->BaseScore(ctx_)(0);
  // loop over output groups
  for (uint32_t gid = 0; gid < model.learner_model_param->num_output_group; ++gid) {
@@ -752,16 +793,23 @@ class CPUPredictor : public Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *out_preds,
                 const gbm::GBTreeModel &model, unsigned ntree_limit) const override {
  auto const n_threads = this->ctx_->Threads();
-  std::vector<RegTree::FVec> feat_vecs;
-  const int num_feature = model.learner_model_param->num_feature;
-  InitThreadTemp(n_threads, &feat_vecs);
-  const MetaInfo &info = p_fmat->Info();
  // number of valid trees
  if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
    ntree_limit = static_cast<unsigned>(model.trees.size());
  }
+  const MetaInfo &info = p_fmat->Info();
  std::vector<bst_float> &preds = out_preds->HostVector();
  preds.resize(info.num_row_ * ntree_limit);
if (p_fmat->Info().IsColumnSplit()) {
ColumnSplitHelper helper(n_threads, model, 0, ntree_limit);
helper.PredictLeaf(p_fmat, &preds);
return;
}
std::vector<RegTree::FVec> feat_vecs;
const int num_feature = model.learner_model_param->num_feature;
InitThreadTemp(n_threads, &feat_vecs);
  // start collecting the prediction
  for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
    // parallel over local batch
@@ -796,6 +844,8 @@ class CPUPredictor : public Predictor {
                         int condition, unsigned condition_feature) const override {
  CHECK(!model.learner_model_param->IsVectorLeaf())
      << "Predict contribution" << MTNotImplemented();
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
  auto const n_threads = this->ctx_->Threads();
  const int num_feature = model.learner_model_param->num_feature;
  std::vector<RegTree::FVec> feat_vecs;
@@ -877,6 +927,8 @@ class CPUPredictor : public Predictor {
                   bool approximate) const override {
  CHECK(!model.learner_model_param->IsVectorLeaf())
      << "Predict interaction contribution" << MTNotImplemented();
CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict interaction contribution support for "
"column-wise data split is not yet implemented.";
  const MetaInfo& info = p_fmat->Info();
  const int ngroup = model.learner_model_param->num_output_group;
  size_t const ncolumns = model.learner_model_param->num_feature;
View File
@@ -14,9 +14,15 @@
#include "../common/bitfield.h"
#include "../common/categorical.h"
#include "../common/common.h"
#if defined(XGBOOST_USE_CUDA)
#include "../common/device_helpers.cuh"
#include "../data/device_adapter.cuh"
#include "../data/ellpack_page.cuh"
#elif defined(XGBOOST_USE_HIP)
#include "../common/device_helpers.hip.h"
#include "../data/device_adapter.hip.h"
#include "../data/ellpack_page.hip.h"
#endif
#include "../data/proxy_dmatrix.h"
#include "../gbm/gbtree_model.h"
#include "predict_fn.h"
@@ -989,7 +995,7 @@ class GPUPredictor : public xgboost::Predictor {
void PredictInstance(const SparsePage::Inst&,
                     std::vector<bst_float>*,
-                     const gbm::GBTreeModel&, unsigned) const override {
+                     const gbm::GBTreeModel&, unsigned, bool) const override {
  LOG(FATAL) << "[Internal error]: " << __func__
             << " is not implemented in GPU Predictor.";
}
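Editor's note: the include hunk above selects vendor-specific headers with the same macros that guard the stubs, so one source tree targets either CUDA or HIP. A standalone sketch of the idiom, with hypothetical aliases that are not XGBoost's actual wrappers:

// Select the device runtime at compile time so the same translation unit
// builds against CUDA or HIP, with a common alias for downstream code.
#if defined(XGBOOST_USE_CUDA)
#include <cuda_runtime.h>
using DeviceError = cudaError_t;
inline DeviceError DeviceSync() { return cudaDeviceSynchronize(); }
#elif defined(XGBOOST_USE_HIP)
#include <hip/hip_runtime.h>
using DeviceError = hipError_t;
inline DeviceError DeviceSync() { return hipDeviceSynchronize(); }
#else
#error "Build with -DXGBOOST_USE_CUDA or -DXGBOOST_USE_HIP"
#endif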
View File
@@ -24,7 +24,7 @@ set -x
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.02
+RAPIDS_VERSION=23.04
SPARK_VERSION=3.4.0
JDK_VERSION=8
View File
@@ -0,0 +1,10 @@
#!/bin/bash
set -euo pipefail

# Extract the latest RAPIDS release (e.g. "v23.04.00" -> "23.04") from the cuml repo.
LATEST_RAPIDS_VERSION=$(gh api repos/rapidsai/cuml/releases/latest --jq '.name' | sed -e 's/^v\([[:digit:]]\+\.[[:digit:]]\+\).*/\1/')
echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION"

# Update the pinned RAPIDS_VERSION in the conftest.sh next to this script.
PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" "$PARENT_PATH/conftest.sh"
View File
@@ -18,8 +18,17 @@ rm -rf $(find . -name target)
rm -rf ../build/

# Re-build package without Mock Rabit
# Maven profiles:
#   `default` includes modules: xgboost4j, xgboost4j-spark, xgboost4j-flink, xgboost4j-example
#   `gpu` includes modules: xgboost4j-gpu, xgboost4j-spark-gpu, and sets `use.cuda = ON`
#   `scala-2.13` sets the Scala binary version to 2.13
#   `release-to-s3` sets the Maven deployment targets
# Deploy to S3 bucket xgboost-maven-repo
-mvn --no-transfer-progress package deploy -Duse.cuda=ON -P release-to-s3 -Dspark.version=${spark_version} -DskipTests
+mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
# Deploy Scala 2.13 artifacts to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests

set +x
set +e
View File
@@ -90,7 +90,7 @@ def check_cmd_print_failure_assistance(cmd: List[str]) -> bool:
    subprocess.run([cmd[0], "--version"])
    msg = """
-Please run the following command on your machine to address the formatting error:
+Please run the following command on your machine to address the error:
    """
    msg += " ".join(cmd)
View File
@@ -17,34 +17,30 @@
#include "xgboost/host_device_vector.h"  // for HostDeviceVector
#include "xgboost/json.h"                // for Json, String, Object

-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {

inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) {
-  // When the limit for precision is not given, it takes the limit at
-  // std::numeric_limits<unsigned>::max(); hence all values are very small
-  // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("pre", &ctx);
+  std::unique_ptr<xgboost::Metric> metric{Metric::Create("pre", &ctx)};
  ASSERT_STREQ(metric->Name(), "pre");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            { 0, 0, 1, 1}, {}, {}, data_split_mode),
-              0, 1e-7);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5, 1e-7);
+  EXPECT_NEAR(
+      GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode),
+      0.5, 1e-7);

-  delete metric;
-  metric = xgboost::Metric::Create("pre@2", &ctx);
+  metric.reset(xgboost::Metric::Create("pre@2", &ctx));
  ASSERT_STREQ(metric->Name(), "pre@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            { 0, 0, 1, 1}, {}, {}, data_split_mode),
-              0.5f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
+  EXPECT_NEAR(
+      GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode),
+      0.5f, 0.001f);

-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}, {}, {}, data_split_mode));
-  delete metric;
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));
+  metric.reset(xgboost::Metric::Create("pre@4", &ctx));
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f},
+                            {0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f}, {}, {}, data_split_mode),
+              0.5f, 1e-7);
}
inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) {
@@ -187,5 +183,4 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo
  ndcg = metric->Evaluate(predt, p_fmat);
  ASSERT_NEAR(ndcg, 1.0, kRtEps);
}
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
View File
@@ -17,13 +17,15 @@
#include "test_predictor.h"

namespace xgboost {
-TEST(CpuPredictor, Basic) {
+namespace {
+void TestBasic(DMatrix* dmat) {
  auto lparam = CreateEmptyGenericParam(GPUIDX);
  std::unique_ptr<Predictor> cpu_predictor =
      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
-  size_t constexpr kRows = 5;
-  size_t constexpr kCols = 5;
+  size_t const kRows = dmat->Info().num_row_;
+  size_t const kCols = dmat->Info().num_col_;

  LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
@@ -31,12 +33,10 @@ TEST(CpuPredictor, Basic) {
  ctx.UpdateAllowUnknown(Args{});
  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

-  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
  // Test predict batch
  PredictionCacheEntry out_predictions;
  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+  cpu_predictor->PredictBatch(dmat, &out_predictions, model, 0);
  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
@@ -44,26 +44,32 @@ TEST(CpuPredictor, Basic) {
  }

  // Test predict instance
-  auto const &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
+  auto const& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
  auto page = batch.GetView();
  for (size_t i = 0; i < batch.Size(); i++) {
    std::vector<float> instance_out_predictions;
-    cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model);
+    cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model, 0,
+                                   dmat->Info().IsColumnSplit());
    ASSERT_EQ(instance_out_predictions[0], 1.5);
  }
  // Test predict leaf
  HostDeviceVector<float> leaf_out_predictions;
-  cpu_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
+  cpu_predictor->PredictLeaf(dmat, &leaf_out_predictions, model);
  auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
  for (auto v : h_leaf_out_predictions) {
    ASSERT_EQ(v, 0);
  }
if (dmat->Info().IsColumnSplit()) {
// Predict contribution is not supported for column split.
return;
}
  // Test predict contribution
  HostDeviceVector<float> out_contribution_hdv;
  auto& out_contribution = out_contribution_hdv.HostVector();
-  cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model);
+  cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model);
  ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1));
  for (size_t i = 0; i < out_contribution.size(); ++i) {
    auto const& contri = out_contribution[i];
@@ -76,8 +82,7 @@ TEST(CpuPredictor, Basic) {
    }
  }

  // Test predict contribution (approximate method)
-  cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model,
-                                     0, nullptr, true);
+  cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true);
  for (size_t i = 0; i < out_contribution.size(); ++i) {
    auto const& contri = out_contribution[i];
    // shift 1 for bias, as test tree is a decision dump, only global bias is
@@ -89,41 +94,32 @@ TEST(CpuPredictor, Basic) {
    }
  }
}
+}  // anonymous namespace

-namespace {
-void TestColumnSplitPredictBatch() {
+TEST(CpuPredictor, Basic) {
  size_t constexpr kRows = 5;
  size_t constexpr kCols = 5;
  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+  TestBasic(dmat.get());
+}
+
+namespace {
+void TestColumnSplit() {
+  size_t constexpr kRows = 5;
+  size_t constexpr kCols = 5;
+  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();

  auto const world_size = collective::GetWorldSize();
  auto const rank = collective::GetRank();
+  dmat = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};

-  auto lparam = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
-  LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  auto sliced = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};
-  cpu_predictor->PredictBatch(sliced.get(), &out_predictions, model, 0);
-  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
-  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
-    ASSERT_EQ(out_predictions_h[i], 1.5);
-  }
+  TestBasic(dmat.get());
}
}  // anonymous namespace

-TEST(CpuPredictor, ColumnSplit) {
+TEST(CpuPredictor, ColumnSplitBasic) {
  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestColumnSplitPredictBatch);
+  RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit);
}
TEST(CpuPredictor, IterationRange) {
@@ -133,69 +129,8 @@ TEST(CpuPredictor, IterationRange) {
TEST(CpuPredictor, ExternalMemory) {
  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
-  auto lparam = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
-  LearnerModelParam mparam{MakeMP(dmat->Info().num_col_, .0, 1)};
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
-  std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
-  ASSERT_EQ(out_predictions.predictions.Size(), dmat->Info().num_row_);
-  for (const auto& v : out_predictions_h) {
-    ASSERT_EQ(v, 1.5);
-  }
-  // Test predict leaf
-  HostDeviceVector<float> leaf_out_predictions;
-  cpu_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
-  auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
-  ASSERT_EQ(h_leaf_out_predictions.size(), dmat->Info().num_row_);
-  for (const auto& v : h_leaf_out_predictions) {
-    ASSERT_EQ(v, 0);
-  }
-  // Test predict contribution
-  HostDeviceVector<float> out_contribution_hdv;
-  auto& out_contribution = out_contribution_hdv.HostVector();
-  cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model);
-  ASSERT_EQ(out_contribution.size(), dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-  // Test predict contribution (approximate method)
-  HostDeviceVector<float> out_contribution_approximate_hdv;
-  auto& out_contribution_approximate = out_contribution_approximate_hdv.HostVector();
-  cpu_predictor->PredictContribution(
-      dmat.get(), &out_contribution_approximate_hdv, model, 0, nullptr, true);
-  ASSERT_EQ(out_contribution_approximate.size(),
-            dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
+  TestBasic(dmat.get());
}
TEST(CpuPredictor, InplacePredict) {
View File
@@ -5,7 +5,7 @@ import pytest
import xgboost
from xgboost import testing as tm
-from xgboost.testing.metrics import check_quantile_error
+from xgboost.testing.metrics import check_precision_score, check_quantile_error

sys.path.append("tests/python")
import test_eval_metrics as test_em  # noqa
@@ -59,6 +59,9 @@ class TestGPUEvalMetrics:
    def test_pr_auc_ltr(self):
        self.cpu_test.run_pr_auc_ltr("gpu_hist")
def test_precision_score(self):
check_precision_score("gpu_hist")
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_quantile_error(self) -> None:
        check_quantile_error("gpu_hist")
View File
@@ -3,7 +3,7 @@ import pytest
import xgboost as xgb
from xgboost import testing as tm
-from xgboost.testing.metrics import check_quantile_error
+from xgboost.testing.metrics import check_precision_score, check_quantile_error

rng = np.random.RandomState(1337)
@@ -315,6 +315,9 @@ class TestEvalMetrics:
    def test_pr_auc_ltr(self):
        self.run_pr_auc_ltr("hist")
def test_precision_score(self):
check_precision_score("hist")
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_quantile_error(self) -> None:
        check_quantile_error("hist")
View File
@@ -55,6 +55,38 @@ class TestQuantileDMatrix:
        r = np.arange(1.0, n_samples)
        np.testing.assert_allclose(Xy.get_data().toarray()[1:, 0], r)
def test_error(self):
from sklearn.model_selection import train_test_split
rng = np.random.default_rng(1994)
X, y = make_categorical(
n_samples=128, n_features=2, n_categories=3, onehot=False
)
reg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)
w = rng.uniform(0, 1, size=y.shape[0])
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
X, y, w, random_state=1994
)
with pytest.raises(ValueError, match="sample weight"):
reg.fit(
X,
y,
sample_weight=w_train,
eval_set=[(X_test, y_test)],
sample_weight_eval_set=[w_test],
)
with pytest.raises(ValueError, match="sample weight"):
reg.fit(
X_train,
y_train,
sample_weight=w,
eval_set=[(X_test, y_test)],
sample_weight_eval_set=[w_test],
)
    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9])
    def test_with_iterator(self, sparsity: float) -> None:
        n_samples_per_batch = 317