Quantile DMatrix for CPU. (#8130)

- Add a new `QuantileDMatrix` that works for both CPU and GPU. - Deprecate `DeviceQuantileDMatrix`.
2022-08-02 15:51:23 +08:00 · 2022-08-02 15:51:23 +08:00 · d87f69215e
commit d87f69215e
parent 2cba1d9fcc
14 changed files with 521 additions and 117 deletions
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@ -22,6 +22,9 @@ Core Data Structure
    :members:
    :show-inheritance:
 .. autoclass:: xgboost.QuantileDMatrix
    :show-inheritance:
 .. autoclass:: xgboost.DeviceQuantileDMatrix
    :show-inheritance:
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@ -415,28 +415,26 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
 *
 * \return 0 when success, -1 when failure happens
 */
-XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
+XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
-                                        DMatrixHandle proxy,
+                                        DataIterResetCallback *reset, XGDMatrixCallbackNext *next,
-                                        DataIterResetCallback *reset,
+                                        char const *c_json_config, DMatrixHandle *out);
                                        XGDMatrixCallbackNext *next,
                                        char const* c_json_config,
                                        DMatrixHandle *out);
 /*!
 * \brief Create a Quantile DMatrix with data iterator.
 *
- * Short note for how to use the second set of callback for GPU Hist tree method:
+ * Short note for how to use the second set of callback for (GPU)Hist tree method:
 *
 * - Step 0: Define a data iterator with 2 methods `reset`, and `next`.
 * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
 * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *           `XGDeviceQuantileDMatrixCreateFromCallback`.
+ *           `XGQuantileDMatrixCreateFromCallback`.
 * - Step 3: Call appropriate data setters in `next` functions.
 *
- * See test_iterative_device_dmatrix.cu or Python interface for examples.
+ * See test_iterative_dmatrix.cu or Python interface for examples.
 *
 * \param iter     A handle to external data iterator.
 * \param proxy    A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
 * \param ref      Reference DMatrix for providing quantile information.
 * \param reset    Callback function resetting the iterator state.
 * \param next     Callback function yielding the next batch of data.
 * \param missing  Which value to represent missing value
@ -446,10 +444,20 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
 *
 * \return 0 when success, -1 when failure happens
 */
-XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
+XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
-    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
+                                                DataIterHandle ref, DataIterResetCallback *reset,
-    XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin,
+                                                XGDMatrixCallbackNext *next, char const *config,
-    DMatrixHandle *out);
+                                                DMatrixHandle *out);
 /*!
 * \brief Create a Device Quantile DMatrix with data iterator.
 * \deprecated since 2.0
 * \see XGQuantileDMatrixCreateFromCallback()
 */
 XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
                                                      DataIterResetCallback *reset,
                                                      XGDMatrixCallbackNext *next, float missing,
                                                      int nthread, int max_bin, DMatrixHandle *out);
 /*!
 * \brief Set data on a DMatrix proxy.
--- a/python-package/xgboost/init.py
+++ b/python-package/xgboost/init.py
@ -6,6 +6,7 @@ Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
 from .core import (
    DMatrix,
    DeviceQuantileDMatrix,
    QuantileDMatrix,
    Booster,
    DataIter,
    build_info,
@ -33,6 +34,7 @@ __all__ = [
    # core
    "DMatrix",
    "DeviceQuantileDMatrix",
    "QuantileDMatrix",
    "Booster",
    "DataIter",
    "train",
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@ -1146,7 +1146,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
        Parameters
        ----------
-        feature_types : list or None
+        feature_types :
            Labels for features. None will reset existing feature names
        """
@ -1189,7 +1189,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
 class _ProxyDMatrix(DMatrix):
-    """A placeholder class when DMatrix cannot be constructed (DeviceQuantileDMatrix,
+    """A placeholder class when DMatrix cannot be constructed (QuantileDMatrix,
    inplace_predict).
    """
@ -1234,17 +1234,35 @@ class _ProxyDMatrix(DMatrix):
        )
-class DeviceQuantileDMatrix(DMatrix):
+class QuantileDMatrix(DMatrix):
-    """Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do
+    """A DMatrix variant that generates quantilized data directly from input for
-    not use this for test/validation tasks as some information may be lost in
+    ``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
-    quantisation. This DMatrix is primarily designed to save memory in training from
+    memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
-    device memory inputs by avoiding intermediate storage. Set max_bin to control the
+    number of bins during quantisation, which should be consistent with the training
-    number of bins during quantisation.  See doc string in :py:obj:`xgboost.DMatrix` for
+    parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
-    documents on meta info.
+    ``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
    it defeats the purpose of saving memory) constructed from training dataset.  See
    :py:obj:`xgboost.DMatrix` for documents on meta info.
-    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
+    .. note::
-    .. versionadded:: 1.1.0
+        Do not use ``QuantileDMatrix`` as validation/test dataset without supplying a
        reference (the training dataset) ``QuantileDMatrix`` using ``ref`` as some
        information may be lost in quantisation.
    .. versionadded:: 2.0.0
    Parameters
    ----------
    max_bin :
        The number of histogram bin, should be consistent with the training parameter
        ``max_bin``.
    ref :
        The training dataset that provides quantile information, needed when creating
        validation/test dataset with ``QuantileDMatrix``. Supplying the training DMatrix
        as a reference means that the same quantisation applied to the training data is
        applied to the validation/test data
    """
@ -1261,7 +1279,8 @@ class DeviceQuantileDMatrix(DMatrix):
        feature_names: Optional[FeatureNames] = None,
        feature_types: Optional[FeatureTypes] = None,
        nthread: Optional[int] = None,
-        max_bin: int = 256,
+        max_bin: Optional[int] = None,
        ref: Optional[DMatrix] = None,
        group: Optional[ArrayLike] = None,
        qid: Optional[ArrayLike] = None,
        label_lower_bound: Optional[ArrayLike] = None,
@ -1269,9 +1288,9 @@ class DeviceQuantileDMatrix(DMatrix):
        feature_weights: Optional[ArrayLike] = None,
        enable_categorical: bool = False,
    ) -> None:
-        self.max_bin = max_bin
+        self.max_bin: int = max_bin if max_bin is not None else 256
        self.missing = missing if missing is not None else np.nan
-        self.nthread = nthread if nthread is not None else 1
+        self.nthread = nthread if nthread is not None else -1
        self._silent = silent  # unused, kept for compatibility
        if isinstance(data, ctypes.c_void_p):
@ -1280,12 +1299,13 @@ class DeviceQuantileDMatrix(DMatrix):
        if qid is not None and group is not None:
            raise ValueError(
-                'Only one of the eval_qid or eval_group for each evaluation '
+                "Only one of the eval_qid or eval_group for each evaluation "
-                'dataset should be provided.'
+                "dataset should be provided."
            )
        self._init(
            data,
            ref=ref,
            label=label,
            weight=weight,
            base_margin=base_margin,
@ -1299,7 +1319,13 @@ class DeviceQuantileDMatrix(DMatrix):
            enable_categorical=enable_categorical,
        )
-    def _init(self, data: DataType, enable_categorical: bool, **meta: Any) -> None:
+    def _init(
        self,
        data: DataType,
        ref: Optional[DMatrix],
        enable_categorical: bool,
        **meta: Any,
    ) -> None:
        from .data import (
            _is_dlpack,
            _transform_dlpack,
@ -1317,20 +1343,26 @@ class DeviceQuantileDMatrix(DMatrix):
            it = SingleBatchInternalIter(data=data, **meta)
        handle = ctypes.c_void_p()
-        reset_callback, next_callback = it.get_callbacks(False, enable_categorical)
+        reset_callback, next_callback = it.get_callbacks(True, enable_categorical)
        if it.cache_prefix is not None:
            raise ValueError(
-                "DeviceQuantileDMatrix doesn't cache data, remove the cache_prefix "
+                "QuantileDMatrix doesn't cache data, remove the cache_prefix "
                "in iterator to fix this error."
            )
-        ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
+
        args = {
            "nthread": self.nthread,
            "missing": self.missing,
            "max_bin": self.max_bin,
        }
        config = from_pystr_to_cstr(json.dumps(args))
        ret = _LIB.XGQuantileDMatrixCreateFromCallback(
            None,
            it.proxy.handle,
            ref.handle if ref is not None else ref,
            reset_callback,
            next_callback,
-            ctypes.c_float(self.missing),
+            config,
            ctypes.c_int(self.nthread),
            ctypes.c_int(self.max_bin),
            ctypes.byref(handle),
        )
        it.reraise()
@ -1339,6 +1371,20 @@ class DeviceQuantileDMatrix(DMatrix):
        self.handle = handle
 class DeviceQuantileDMatrix(QuantileDMatrix):
    """ Use `QuantileDMatrix` instead.
    .. deprecated:: 2.0.0
    .. versionadded:: 1.1.0
    """
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        warnings.warn("Please use `QuantileDMatrix` instead.", FutureWarning)
        super().__init__(*args, **kwargs)
 Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
 Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@ -35,6 +35,7 @@ import collections
 import logging
 import platform
 import socket
 import warnings
 from collections import defaultdict
 from contextlib import contextmanager
 from functools import partial, update_wrapper
@ -64,10 +65,10 @@ from .compat import DataFrame, LazyLoader, concat, lazy_isinstance
 from .core import (
    Booster,
    DataIter,
    DeviceQuantileDMatrix,
    DMatrix,
    Metric,
    Objective,
    QuantileDMatrix,
    _deprecate_positional_args,
    _expect,
    _has_categorical,
@ -495,7 +496,7 @@ async def map_worker_partitions(
    client: Optional["distributed.Client"],
    func: Callable[..., _MapRetT],
    *refs: Any,
-    workers: List[str],
+    workers: Sequence[str],
 ) -> List[_MapRetT]:
    """Map a function onto partitions of each worker."""
    # Note for function purity:
@ -628,22 +629,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
        return 1
-class DaskDeviceQuantileDMatrix(DaskDMatrix):
+class DaskQuantileDMatrix(DaskDMatrix):
    """Specialized data type for `gpu_hist` tree method.  This class is used to reduce
    the memory usage by eliminating data copies.  Internally the all partitions/chunks
    of data are merged by weighted GK sketching.  So the number of partitions from dask
    may affect training accuracy as GK generates bounded error for each merge.  See doc
    string for :py:obj:`xgboost.DeviceQuantileDMatrix` and :py:obj:`xgboost.DMatrix` for
    other parameters.
    .. versionadded:: 1.2.0
    Parameters
    ----------
    max_bin : Number of bins for histogram construction.
    """
    @_deprecate_positional_args
    def __init__(
        self,
@ -657,7 +643,8 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
        silent: bool = False,  # disable=unused-argument
        feature_names: Optional[FeatureNames] = None,
        feature_types: Optional[Union[Any, List[Any]]] = None,
-        max_bin: int = 256,
+        max_bin: Optional[int] = None,
        ref: Optional[DMatrix] = None,
        group: Optional[_DaskCollection] = None,
        qid: Optional[_DaskCollection] = None,
        label_lower_bound: Optional[_DaskCollection] = None,
@ -684,14 +671,31 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
        )
        self.max_bin = max_bin
        self.is_quantile = True
        self._ref: Optional[int] = id(ref) if ref is not None else None
    def _create_fn_args(self, worker_addr: str) -> Dict[str, Any]:
        args = super()._create_fn_args(worker_addr)
        args["max_bin"] = self.max_bin
        if self._ref is not None:
            args["ref"] = self._ref
        return args
-def _create_device_quantile_dmatrix(
+class DaskDeviceQuantileDMatrix(DaskQuantileDMatrix):
    """Use `DaskQuantileDMatrix` instead.
    .. deprecated:: 2.0.0
    .. versionadded:: 1.2.0
    """
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        warnings.warn("Please use `DaskQuantileDMatrix` instead.", FutureWarning)
        super().__init__(*args, **kwargs)
 def _create_quantile_dmatrix(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[Union[Any, List[Any]]],
    feature_weights: Optional[Any],
@ -700,18 +704,20 @@ def _create_device_quantile_dmatrix(
    parts: Optional[_DataParts],
    max_bin: int,
    enable_categorical: bool,
-) -> DeviceQuantileDMatrix:
+    ref: Optional[DMatrix] = None,
 ) -> QuantileDMatrix:
    worker = distributed.get_worker()
    if parts is None:
        msg = f"worker {worker.address} has an empty DMatrix."
        LOGGER.warning(msg)
        import cupy
-        d = DeviceQuantileDMatrix(
+        d = QuantileDMatrix(
            cupy.zeros((0, 0)),
            feature_names=feature_names,
            feature_types=feature_types,
            max_bin=max_bin,
            ref=ref,
            enable_categorical=enable_categorical,
        )
        return d
@ -719,13 +725,14 @@ def _create_device_quantile_dmatrix(
    unzipped_dict = _get_worker_parts(parts)
    it = DaskPartitionIter(**unzipped_dict)
-    dmatrix = DeviceQuantileDMatrix(
+    dmatrix = QuantileDMatrix(
        it,
        missing=missing,
        feature_names=feature_names,
        feature_types=feature_types,
        nthread=nthread,
        max_bin=max_bin,
        ref=ref,
        enable_categorical=enable_categorical,
    )
    dmatrix.set_info(feature_weights=feature_weights)
@ -786,11 +793,9 @@ def _create_dmatrix(
    return dmatrix
-def _dmatrix_from_list_of_parts(
+def _dmatrix_from_list_of_parts(is_quantile: bool, **kwargs: Any) -> DMatrix:
    is_quantile: bool, **kwargs: Any
 ) -> Union[DMatrix, DeviceQuantileDMatrix]:
    if is_quantile:
-        return _create_device_quantile_dmatrix(**kwargs)
+        return _create_quantile_dmatrix(**kwargs)
    return _create_dmatrix(**kwargs)
@ -921,7 +926,18 @@ async def _train_async(
                if evals_id[i] == train_id:
                    evals.append((Xy, evals_name[i]))
                    continue
-                eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads)
+                if ref.get("ref", None) is not None:
                    if ref["ref"] != train_id:
                        raise ValueError(
                            "The training DMatrix should be used as a reference"
                            " to evaluation `QuantileDMatrix`."
                        )
                    del ref["ref"]
                    eval_Xy = _dmatrix_from_list_of_parts(
                        **ref, nthread=n_threads, ref=Xy
                    )
                else:
                    eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads)
                evals.append((eval_Xy, evals_name[i]))
            booster = worker_train(
@ -960,12 +976,14 @@ async def _train_async(
        results = await map_worker_partitions(
            client,
            dispatched_train,
            # extra function parameters
            params,
            _rabit_args,
            id(dtrain),
            evals_name,
            evals_id,
            *([dtrain] + evals_data),
            # workers to be used for training
            workers=workers,
        )
        return list(filter(lambda ret: ret is not None, results))[0]
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@ -1167,6 +1167,7 @@ def _proxy_transform(
    if _is_dlpack(data):
        return _transform_dlpack(data), None, feature_names, feature_types
    if _is_numpy_array(data):
        data, _ = _ensure_np_dtype(data, data.dtype)
        return data, None, feature_names, feature_types
    if _is_scipy_csr(data):
        return data, None, feature_names, feature_types
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@ -281,11 +281,36 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
                                                      int nthread, int max_bin,
                                                      DMatrixHandle *out) {
  API_BEGIN();
  LOG(WARNING) << __func__ << " is deprecated. Use `XGQuantileDMatrixCreateFromCallback` instead.";
  *out = new std::shared_ptr<xgboost::DMatrix>{
      xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)};
  API_END();
 }
 XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
                                                DataIterHandle ref, DataIterResetCallback *reset,
                                                XGDMatrixCallbackNext *next, char const *config,
                                                DMatrixHandle *out) {
  API_BEGIN();
  std::shared_ptr<DMatrix> _ref{nullptr};
  if (ref) {
    auto pp_ref = static_cast<std::shared_ptr<xgboost::DMatrix> *>(ref);
    StringView err{"Invalid handle to ref."};
    CHECK(pp_ref) << err;
    _ref = *pp_ref;
    CHECK(_ref) << err;
  }
  auto jconfig = Json::Load(StringView{config});
  auto missing = GetMissing(jconfig);
  auto n_threads = OptionalArg<Integer, int64_t>(jconfig, "nthread", common::OmpGetNumThreads(0));
  auto max_bin = OptionalArg<Integer, int64_t>(jconfig, "max_bin", 256);
  *out = new std::shared_ptr<xgboost::DMatrix>{
      xgboost::DMatrix::Create(iter, proxy, _ref, reset, next, missing, n_threads, max_bin)};
  API_END();
 }
 XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
  API_BEGIN();
  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);;
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@ -1,4 +1,3 @@
 # -*- coding: utf-8 -*-
 import numpy as np
 import xgboost as xgb
 import pytest
@ -6,16 +5,14 @@ import sys
 sys.path.append("tests/python")
 import testing as tm
 import test_quantile_dmatrix as tqd
 class TestDeviceQuantileDMatrix:
-    def test_dmatrix_numpy_init(self):
+    cputest = tqd.TestQuantileDMatrix()
        data = np.random.randn(5, 5)
        with pytest.raises(TypeError, match='is not supported'):
            xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))
    @pytest.mark.skipif(**tm.no_cupy())
-    def test_dmatrix_feature_weights(self):
+    def test_dmatrix_feature_weights(self) -> None:
        import cupy as cp
        rng = cp.random.RandomState(1994)
        data = rng.randn(5, 5)
@ -29,7 +26,7 @@ class TestDeviceQuantileDMatrix:
            feature_weights.astype(np.float32))
    @pytest.mark.skipif(**tm.no_cupy())
-    def test_dmatrix_cupy_init(self):
+    def test_dmatrix_cupy_init(self) -> None:
        import cupy as cp
        data = cp.random.randn(5, 5)
        xgb.DeviceQuantileDMatrix(data, cp.ones(5, dtype=np.float64))
@ -55,3 +52,10 @@ class TestDeviceQuantileDMatrix:
        cp.testing.assert_allclose(fw, got_fw)
        cp.testing.assert_allclose(labels, got_labels)
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_ref_dmatrix(self) -> None:
        import cupy as cp
        rng = cp.random.RandomState(1994)
        self.cputest.run_ref_dmatrix(rng, "gpu_hist", False)
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@ -429,9 +429,10 @@ class TestDistributedGPU:
        sig = OrderedDict(signature(dxgb.DaskDMatrix).parameters)
        del sig["client"]
        ddm_names = list(sig.keys())
-        sig = OrderedDict(signature(dxgb.DaskDeviceQuantileDMatrix).parameters)
+        sig = OrderedDict(signature(dxgb.DaskQuantileDMatrix).parameters)
        del sig["client"]
        del sig["max_bin"]
        del sig["ref"]
        ddqdm_names = list(sig.keys())
        assert len(ddm_names) == len(ddqdm_names)
@ -442,9 +443,10 @@ class TestDistributedGPU:
        sig = OrderedDict(signature(xgb.DMatrix).parameters)
        del sig["nthread"]      # no nthread in dask
        dm_names = list(sig.keys())
-        sig = OrderedDict(signature(xgb.DeviceQuantileDMatrix).parameters)
+        sig = OrderedDict(signature(xgb.QuantileDMatrix).parameters)
        del sig["nthread"]
        del sig["max_bin"]
        del sig["ref"]
        dqdm_names = list(sig.keys())
        # between single node
@ -499,7 +501,6 @@ class TestDistributedGPU:
            for arg in rabit_args:
                if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                    port_env = arg.decode('utf-8')
                    port_env = arg.decode('utf-8')
                if arg.decode("utf-8").startswith("DMLC_TRACKER_URI"):
                    uri_env = arg.decode("utf-8")
            port = port_env.split('=')
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@ -1,32 +1,12 @@
 import xgboost as xgb
 from xgboost.data import SingleBatchInternalIter as SingleBatch
 import numpy as np
-from testing import IteratorForTest, non_increasing
+from testing import IteratorForTest, non_increasing, make_batches
 from typing import Tuple, List
 import pytest
 from hypothesis import given, strategies, settings
 from scipy.sparse import csr_matrix
 def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
 ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
    if use_cupy:
        import cupy
        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)
    for i in range(n_batches):
        _X = rng.randn(n_samples_per_batch, n_features)
        _y = rng.randn(n_samples_per_batch)
        X.append(_X)
        y.append(_y)
    return X, y
 def test_single_batch(tree_method: str = "approx") -> None:
    from sklearn.datasets import load_breast_cancer
@ -111,8 +91,8 @@ def run_data_iterator(
    if not subsample:
        assert non_increasing(results_from_it["Train"]["rmse"])
-    X, y = it.as_arrays()
+    X, y, w = it.as_arrays()
-    Xy = xgb.DMatrix(X, y)
+    Xy = xgb.DMatrix(X, y, weight=w)
    assert Xy.num_row() == n_samples_per_batch * n_batches
    assert Xy.num_col() == n_features
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@ -0,0 +1,212 @@
 from typing import Dict, List, Any
 import numpy as np
 import pytest
 from scipy import sparse
 from testing import IteratorForTest, make_batches, make_batches_sparse, make_categorical
 import xgboost as xgb
 class TestQuantileDMatrix:
    def test_basic(self) -> None:
        n_samples = 234
        n_features = 8
        rng = np.random.default_rng()
        X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
            n_samples, n_features
        )
        y = rng.normal(0, 3, size=n_samples)
        Xy = xgb.QuantileDMatrix(X, y)
        assert Xy.num_row() == n_samples
        assert Xy.num_col() == n_features
        X = sparse.random(n_samples, n_features, density=0.1, format="csr")
        Xy = xgb.QuantileDMatrix(X, y)
        assert Xy.num_row() == n_samples
        assert Xy.num_col() == n_features
        X = sparse.random(n_samples, n_features, density=0.8, format="csr")
        Xy = xgb.QuantileDMatrix(X, y)
        assert Xy.num_row() == n_samples
        assert Xy.num_col() == n_features
    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9])
    def test_with_iterator(self, sparsity: float) -> None:
        n_samples_per_batch = 317
        n_features = 8
        n_batches = 7
        if sparsity == 0.0:
            it = IteratorForTest(
                *make_batches(n_samples_per_batch, n_features, n_batches, False), None
            )
        else:
            it = IteratorForTest(
                *make_batches_sparse(
                    n_samples_per_batch, n_features, n_batches, sparsity
                ),
                None
            )
        Xy = xgb.QuantileDMatrix(it)
        assert Xy.num_row() == n_samples_per_batch * n_batches
        assert Xy.num_col() == n_features
    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
    def test_training(self, sparsity: float) -> None:
        n_samples_per_batch = 317
        n_features = 8
        n_batches = 7
        if sparsity == 0.0:
            it = IteratorForTest(
                *make_batches(n_samples_per_batch, n_features, n_batches, False), None
            )
        else:
            it = IteratorForTest(
                *make_batches_sparse(
                    n_samples_per_batch, n_features, n_batches, sparsity
                ),
                None
            )
        parameters = {"tree_method": "hist", "max_bin": 256}
        Xy_it = xgb.QuantileDMatrix(it, max_bin=parameters["max_bin"])
        from_it = xgb.train(parameters, Xy_it)
        X, y, w = it.as_arrays()
        w_it = Xy_it.get_weight()
        np.testing.assert_allclose(w_it, w)
        Xy_arr = xgb.DMatrix(X, y, weight=w)
        from_arr = xgb.train(parameters, Xy_arr)
        np.testing.assert_allclose(from_arr.predict(Xy_it), from_it.predict(Xy_arr))
        y -= y.min()
        y += 0.01
        Xy = xgb.QuantileDMatrix(X, y, weight=w)
        with pytest.raises(ValueError, match=r"Only.*hist.*"):
            parameters = {
                "tree_method": "approx",
                "max_bin": 256,
                "objective": "reg:gamma",
            }
            xgb.train(parameters, Xy)
    def run_ref_dmatrix(self, rng: Any, tree_method: str, enable_cat: bool) -> None:
        n_samples, n_features = 2048, 17
        if enable_cat:
            X, y = make_categorical(
                n_samples, n_features, n_categories=13, onehot=False
            )
            if tree_method == "gpu_hist":
                import cudf
                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
            X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
                n_samples, n_features
            )
            y = rng.normal(0, 3, size=n_samples)
        # Use ref
        Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
        Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
        qdm_results: Dict[str, Dict[str, List[float]]] = {}
        xgb.train(
            {"tree_method": tree_method},
            Xy,
            evals=[(Xy, "Train"), (Xy_valid, "valid")],
            evals_result=qdm_results,
        )
        np.testing.assert_allclose(
            qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"]
        )
        # No ref
        Xy_valid = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
        qdm_results = {}
        xgb.train(
            {"tree_method": tree_method},
            Xy,
            evals=[(Xy, "Train"), (Xy_valid, "valid")],
            evals_result=qdm_results,
        )
        np.testing.assert_allclose(
            qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"]
        )
        # Different number of features
        Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
        dXy = xgb.DMatrix(X, y, enable_categorical=enable_cat)
        n_samples, n_features = 256, 15
        X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
            n_samples, n_features
        )
        y = rng.normal(0, 3, size=n_samples)
        with pytest.raises(ValueError, match=r".*features\."):
            xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
        # Compare training results
        n_samples, n_features = 256, 17
        if enable_cat:
            X, y = make_categorical(n_samples, n_features, 13, onehot=False)
            if tree_method == "gpu_hist":
                import cudf
                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
            X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
                n_samples, n_features
            )
            y = rng.normal(0, 3, size=n_samples)
        Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
        # use DMatrix as ref
        Xy_valid_d = xgb.QuantileDMatrix(X, y, ref=dXy, enable_categorical=enable_cat)
        dXy_valid = xgb.DMatrix(X, y, enable_categorical=enable_cat)
        qdm_results = {}
        xgb.train(
            {"tree_method": tree_method},
            Xy,
            evals=[(Xy, "Train"), (Xy_valid, "valid")],
            evals_result=qdm_results,
        )
        dm_results: Dict[str, Dict[str, List[float]]] = {}
        xgb.train(
            {"tree_method": tree_method},
            dXy,
            evals=[(dXy, "Train"), (dXy_valid, "valid"), (Xy_valid_d, "dvalid")],
            evals_result=dm_results,
        )
        np.testing.assert_allclose(
            dm_results["Train"]["rmse"], qdm_results["Train"]["rmse"]
        )
        np.testing.assert_allclose(
            dm_results["valid"]["rmse"], qdm_results["valid"]["rmse"]
        )
        np.testing.assert_allclose(
            dm_results["dvalid"]["rmse"], qdm_results["valid"]["rmse"]
        )
    def test_ref_dmatrix(self) -> None:
        rng = np.random.RandomState(1994)
        self.run_ref_dmatrix(rng, "hist", True)
        self.run_ref_dmatrix(rng, "hist", False)
    def test_predict(self) -> None:
        n_samples, n_features = 16, 2
        X, y = make_categorical(
            n_samples, n_features, n_categories=13, onehot=False
        )
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "hist"}, Xy)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        a = booster.predict(Xy)
        qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
        b = booster.predict(qXy)
        np.testing.assert_allclose(a, b)
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@ -1382,6 +1382,42 @@ class TestWithDask:
        num_rounds = 30
        self.run_updater_test(client, params, num_rounds, dataset, 'hist')
    def test_quantile_dmatrix(self, client: Client) -> None:
        X, y = make_categorical(client, 10000, 30, 13)
        Xy = xgb.dask.DaskDMatrix(client, X, y, enable_categorical=True)
        valid_Xy = xgb.dask.DaskDMatrix(client, X, y, enable_categorical=True)
        output = xgb.dask.train(
            client,
            {"tree_method": "hist"},
            Xy,
            num_boost_round=10,
            evals=[(Xy, "Train"), (valid_Xy, "Valid")]
        )
        dmatrix_hist = output["history"]
        Xy = xgb.dask.DaskQuantileDMatrix(client, X, y, enable_categorical=True)
        valid_Xy = xgb.dask.DaskQuantileDMatrix(
            client, X, y, enable_categorical=True, ref=Xy
        )
        output = xgb.dask.train(
            client,
            {"tree_method": "hist"},
            Xy,
            num_boost_round=10,
            evals=[(Xy, "Train"), (valid_Xy, "Valid")]
        )
        quantile_hist = output["history"]
        np.testing.assert_allclose(
            quantile_hist["Train"]["rmse"], dmatrix_hist["Train"]["rmse"]
        )
        np.testing.assert_allclose(
            quantile_hist["Valid"]["rmse"], dmatrix_hist["Valid"]["rmse"]
        )
    @given(params=exact_parameter_strategy,
           dataset=tm.dataset_strategy)
    @settings(deadline=None, suppress_health_check=suppress, print_blob=True)
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@ -1,11 +1,11 @@
 from concurrent.futures import ThreadPoolExecutor
 import os
 import multiprocessing
-from typing import Tuple, Union
+from typing import Tuple, Union, List, Sequence, Callable
 import urllib
 import zipfile
 import sys
-from typing import Optional
+from typing import Optional, Dict, Any
 from contextlib import contextmanager
 from io import StringIO
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
@ -180,79 +180,148 @@ def skip_s390x():
 class IteratorForTest(xgb.core.DataIter):
-    def __init__(self, X, y):
+    def __init__(
        self,
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
        cache: Optional[str] = "./"
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        self.it = 0
-        super().__init__("./")
+        super().__init__(cache)
-    def next(self, input_data):
+    def next(self, input_data: Callable) -> int:
        if self.it == len(self.X):
            return 0
        # Use copy to make sure the iterator doesn't hold a reference to the data.
-        input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
+        input_data(
-        gc.collect()            # clear up the copy, see if XGBoost access freed memory.
+            data=self.X[self.it].copy(),
            label=self.y[self.it].copy(),
            weight=self.w[self.it].copy() if self.w else None,
        )
        gc.collect()  # clear up the copy, see if XGBoost access freed memory.
        self.it += 1
        return 1
-    def reset(self):
+    def reset(self) -> None:
        self.it = 0
-    def as_arrays(self):
+    def as_arrays(
-        X = np.concatenate(self.X, axis=0)
+        self,
    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], np.ndarray, np.ndarray]:
        if isinstance(self.X[0], sparse.csr_matrix):
            X = sparse.vstack(self.X, format="csr")
        else:
            X = np.concatenate(self.X, axis=0)
        y = np.concatenate(self.y, axis=0)
-        return X, y
+        w = np.concatenate(self.w, axis=0)
        return X, y, w
 def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
 ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
    w = []
    if use_cupy:
        import cupy
        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)
    for i in range(n_batches):
        _X = rng.randn(n_samples_per_batch, n_features)
        _y = rng.randn(n_samples_per_batch)
        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w
 def make_batches_sparse(
    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
 ) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
    w = []
    rng = np.random.RandomState(1994)
    for i in range(n_batches):
        _X = sparse.random(
            n_samples_per_batch,
            n_features,
            1.0 - sparsity,
            format="csr",
            dtype=np.float32,
            random_state=rng,
        )
        _y = rng.randn(n_samples_per_batch)
        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w
 # Contains a dataset in numpy format as well as the relevant objective and metric
 class TestDataset:
-    def __init__(self, name, get_dataset, objective, metric):
+    def __init__(
        self, name: str, get_dataset: Callable, objective: str, metric: str
    ) -> None:
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()
-        self.w = None
+        self.w: Optional[np.ndarray] = None
        self.margin: Optional[np.ndarray] = None
-    def set_params(self, params_in):
+    def set_params(self, params_in: Dict[str, Any]) -> Dict[str, Any]:
        params_in['objective'] = self.objective
        params_in['eval_metric'] = self.metric
        if self.objective == "multi:softmax":
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in
-    def get_dmat(self):
+    def get_dmat(self) -> xgb.DMatrix:
        return xgb.DMatrix(
            self.X, self.y, self.w, base_margin=self.margin, enable_categorical=True
        )
-    def get_device_dmat(self):
+    def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
        w = None if self.w is None else cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)
-    def get_external_dmat(self):
+    def get_external_dmat(self) -> xgb.DMatrix:
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1
        predictor = []
        response = []
        weight = []
        for i in range(n_batches):
            beg = i * per_batch
            end = min((i + 1) * per_batch, n_samples)
            assert end != beg
            X = self.X[beg: end, ...]
            y = self.y[beg: end]
            w = self.w[beg: end] if self.w is not None else None
            predictor.append(X)
            response.append(y)
            if w is not None:
                weight.append(w)
-        it = IteratorForTest(predictor, response)
+        it = IteratorForTest(predictor, response, weight if weight else None)
        return xgb.DMatrix(it)
-    def __repr__(self):
+    def __repr__(self) -> str:
        return self.name
--- a/tests/python/with_omp_limit.py
+++ b/tests/python/with_omp_limit.py
@ -1,4 +1,3 @@
 import os
 import xgboost as xgb
 from sklearn.datasets import make_classification
 from sklearn.metrics import roc_auc_score