Enhance inplace prediction. (#6653)

* Accept array interface for csr and array.
* Accept an optional proxy dmatrix for metainfo.

This constructs an explicit `_ProxyDMatrix` type in Python.

* Remove unused doc.
* Add strict output.
2021-02-02 11:41:46 +08:00
parent 87ab1ad607
commit 411592a347
22 changed files with 955 additions and 530 deletions
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -58,21 +58,23 @@ CallbackEnv = collections.namedtuple(
     "evaluation_result_list"])


-def from_pystr_to_cstr(data):
-    """Convert a list of Python str to C pointer
+def from_pystr_to_cstr(data: Union[str, List[str]]):
+    """Convert a Python str or list of Python str to C pointer

    Parameters
    ----------
-    data : list
-        list of str
+    data
+        str or list of str
+
+    Returns
+    -------
+    UTF-8 encoded ``bytes`` for a single str, or a ctypes array of ``c_char_p``
+    holding the UTF-8 encoded items for a list.
+
+    Raises
+    ------
+    TypeError
+        If ``data`` is neither a str nor a list.
    """

-    if not isinstance(data, list):
-        raise NotImplementedError
-    pointers = (ctypes.c_char_p * len(data))()
-    data = [bytes(d, 'utf-8') for d in data]
-    pointers[:] = data
-    return pointers
+    if isinstance(data, str):
+        return bytes(data, "utf-8")
+    if isinstance(data, list):
+        pointers = (ctypes.c_char_p * len(data))()
+        data = [bytes(d, 'utf-8') for d in data]
+        pointers[:] = data
+        return pointers
+    # Name the offending type so the caller can see what was passed in.
+    raise TypeError(
+        f"Expected str or list of str, got: {type(data).__name__}"
+    )


 def from_cstr_to_pystr(data, length):
@@ -190,21 +192,40 @@ def _check_call(ret):
        raise XGBoostError(py_str(_LIB.XGBGetLastError()))


-def ctypes2numpy(cptr, length, dtype) -> np.ndarray:
-    """Convert a ctypes pointer array to a numpy array."""
-    NUMPY_TO_CTYPES_MAPPING = {
+def _numpy2ctypes_type(dtype):
+    """Return the ctypes scalar type registered for a numpy scalar type.
+
+    Parameters
+    ----------
+    dtype
+        A numpy scalar type object such as ``np.float32``.
+
+    Raises
+    ------
+    TypeError
+        If ``dtype`` is not one of the mapped types.
+    """
+    type_map = {
        np.float32: ctypes.c_float,
+        np.float64: ctypes.c_double,
        np.uint32: ctypes.c_uint,
+        np.uint64: ctypes.c_uint64,
+        np.int32: ctypes.c_int32,
+        np.int64: ctypes.c_int64,
    }
-    if dtype not in NUMPY_TO_CTYPES_MAPPING:
-        raise RuntimeError('Supported types: {}'.format(
-            NUMPY_TO_CTYPES_MAPPING.keys()))
-    ctype = NUMPY_TO_CTYPES_MAPPING[dtype]
+    if np.intc is not np.int32:  # Windows
+        # np.intc is a separate type object here; give it the int32 entry.
+        type_map[np.intc] = type_map[np.int32]
+    if dtype in type_map:
+        return type_map[dtype]
+    raise TypeError(f"Supported types: {type_map.keys()}, got: {dtype}")
+
+
+def _array_interface(data: np.ndarray) -> bytes:
+    """Serialize ``data.__array_interface__`` as UTF-8 encoded JSON.
+
+    A ``mask`` entry, when present, is replaced by that mask's own
+    ``__array_interface__`` dict before encoding.
+    """
+    spec = data.__array_interface__
+    if "mask" in spec:
+        spec["mask"] = spec["mask"].__array_interface__
+    return json.dumps(spec, indent=2).encode("utf-8")
+
+
+def ctypes2numpy(cptr, length, dtype) -> np.ndarray:
+    """Convert a ctypes pointer array to a numpy array.
+
+    Copies ``length`` elements from ``cptr`` into a newly allocated 1-D array
+    of ``dtype`` and returns that array.
+
+    Raises
+    ------
+    TypeError
+        Propagated from ``_numpy2ctypes_type`` when ``dtype`` is not supported.
+    RuntimeError
+        If ``cptr`` is not a pointer to the ctypes type matching ``dtype``, or
+        if ``memmove`` returns a false value.
+    """
+    ctype = _numpy2ctypes_type(dtype)
    if not isinstance(cptr, ctypes.POINTER(ctype)):
-        raise RuntimeError('expected {} pointer'.format(ctype))
+        raise RuntimeError("expected {} pointer".format(ctype))
    res = np.zeros(length, dtype=dtype)
    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):
-        raise RuntimeError('memmove failed')
+        raise RuntimeError("memmove failed")
    return res


@@ -214,25 +235,21 @@ def ctypes2cupy(cptr, length, dtype):
    import cupy
    from cupy.cuda.memory import MemoryPointer
    from cupy.cuda.memory import UnownedMemory
-    CUPY_TO_CTYPES_MAPPING = {
-        cupy.float32: ctypes.c_float,
-        cupy.uint32: ctypes.c_uint
-    }
+
+    CUPY_TO_CTYPES_MAPPING = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint}
    if dtype not in CUPY_TO_CTYPES_MAPPING.keys():
-        raise RuntimeError('Supported types: {}'.format(
-            CUPY_TO_CTYPES_MAPPING.keys()
-        ))
+        raise RuntimeError("Supported types: {}".format(CUPY_TO_CTYPES_MAPPING.keys()))
    addr = ctypes.cast(cptr, ctypes.c_void_p).value
    # pylint: disable=c-extension-no-member,no-member
    device = cupy.cuda.runtime.pointerGetAttributes(addr).device
    # The owner field is just used to keep the memory alive with ref count.  As
    # unowned's life time is scoped within this function we don't need that.
    unownd = UnownedMemory(
-        addr, length.value * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]),
-        owner=None)
+        addr, length * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]), owner=None
+    )
    memptr = MemoryPointer(unownd, 0)
    # pylint: disable=unexpected-keyword-arg
-    mem = cupy.ndarray((length.value, ), dtype=dtype, memptr=memptr)
+    mem = cupy.ndarray((length,), dtype=dtype, memptr=memptr)
    assert mem.device.id == device
    arr = cupy.array(mem, copy=True)
    return arr
@@ -256,28 +273,29 @@ def c_str(string):

 def c_array(ctype, values):
-    """Convert a python string to c array."""
+    """Convert a sequence of values to a ctypes array of ``ctype``.
+
+    A numpy array whose item size equals ``ctypes.sizeof(ctype)`` is copied
+    straight from its buffer; any other input is unpacked element by element
+    into the array constructor.
+    """
-    if (isinstance(values, np.ndarray)
-            and values.dtype.itemsize == ctypes.sizeof(ctype)):
+    if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
        return (ctype * len(values)).from_buffer_copy(values)
    return (ctype * len(values))(*values)


+def _prediction_output(shape, dims, predts, is_cuda):
+    """Build the prediction array from the buffers filled by the C API.
+
+    ``shape`` is read as ``dims.value`` uint64 entries; ``predts`` is then read
+    as that many float32 values in total and reshaped accordingly.  The result
+    comes from ``ctypes2cupy`` when ``is_cuda`` is true, otherwise from
+    ``ctypes2numpy``.
+    """
+    out_shape: np.ndarray = ctypes2numpy(shape, dims.value, np.uint64)
+    n_elements = int(np.prod(out_shape))
+    to_array = ctypes2cupy if is_cuda else ctypes2numpy
+    return to_array(predts, n_elements, np.float32).reshape(out_shape)
+
+
 class DataIter:
-    '''The interface for user defined data iterator. Currently is only
-    supported by Device DMatrix.
+    '''The interface for user defined data iterator. Currently is only supported by Device
+    DMatrix.

-    Parameters
-    ----------
-
-    rows : int
-        Total number of rows combining all batches.
-    cols : int
-        Number of columns for each batch.
    '''
    def __init__(self):
-        proxy_handle = ctypes.c_void_p()
-        _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(proxy_handle)))
-        self._handle = DeviceQuantileDMatrix(proxy_handle)
+        self._handle = _ProxyDMatrix()
        self.exception = None

    @property
@@ -300,12 +318,7 @@ class DataIter:
        if self.exception is not None:
            return 0

-        def data_handle(data, label=None, weight=None, base_margin=None,
-                        group=None,
-                        qid=None,
-                        label_lower_bound=None, label_upper_bound=None,
-                        feature_names=None, feature_types=None,
-                        feature_weights=None):
+        def data_handle(data, feature_names=None, feature_types=None, **kwargs):
            from .data import dispatch_device_quantile_dmatrix_set_data
            from .data import _device_quantile_transform
            data, feature_names, feature_types = _device_quantile_transform(
@@ -313,16 +326,9 @@ class DataIter:
            )
            dispatch_device_quantile_dmatrix_set_data(self.proxy, data)
            self.proxy.set_info(
-                label=label,
-                weight=weight,
-                base_margin=base_margin,
-                group=group,
-                qid=qid,
-                label_lower_bound=label_lower_bound,
-                label_upper_bound=label_upper_bound,
                feature_names=feature_names,
                feature_types=feature_types,
-                feature_weights=feature_weights
+                **kwargs,
            )
        try:
            # Differ the exception in order to return 0 and stop the iteration.
@@ -558,7 +564,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
        feature_types=None,
        feature_weights=None
    ) -> None:
-        """Set meta info for DMatrix.  See doc string for DMatrix constructor."""
+        """Set meta info for DMatrix.  See doc string for :py:obj:`xgboost.DMatrix`."""
        from .data import dispatch_meta_backend

        if label is not None:
@@ -959,76 +965,14 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
                c_bst_ulong(0)))


-class DeviceQuantileDMatrix(DMatrix):
-    """Device memory Data Matrix used in XGBoost for training with
-    tree_method='gpu_hist'. Do not use this for test/validation tasks as some
-    information may be lost in quantisation. This DMatrix is primarily designed
-    to save memory in training from device memory inputs by avoiding
-    intermediate storage. Set max_bin to control the number of bins during
-    quantisation.  See doc string in `DMatrix` for documents on meta info.
+class _ProxyDMatrix(DMatrix):
+    """A placeholder class when DMatrix cannot be constructed (DeviceQuantileDMatrix,
+    inplace_predict).

-    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
-
-    .. versionadded:: 1.1.0
    """
-    @_deprecate_positional_args
-    def __init__(              # pylint: disable=super-init-not-called
-        self,
-        data,
-        label=None,
-        *,
-        weight=None,
-        base_margin=None,
-        missing=None,
-        silent=False,
-        feature_names=None,
-        feature_types=None,
-        nthread: Optional[int] = None,
-        max_bin: int = 256,
-        group=None,
-        qid=None,
-        label_lower_bound=None,
-        label_upper_bound=None,
-        feature_weights=None,
-        enable_categorical: bool = False,
-    ):
-        self.max_bin = max_bin
-        self.missing = missing if missing is not None else np.nan
-        self.nthread = nthread if nthread is not None else 1
-        self._silent = silent    # unused, kept for compatibility
-
-        if isinstance(data, ctypes.c_void_p):
-            self.handle = data
-            return
-        from .data import init_device_quantile_dmatrix
-        handle, feature_names, feature_types = init_device_quantile_dmatrix(
-            data,
-            label=label, weight=weight,
-            base_margin=base_margin,
-            group=group,
-            qid=qid,
-            missing=self.missing,
-            label_lower_bound=label_lower_bound,
-            label_upper_bound=label_upper_bound,
-            feature_weights=feature_weights,
-            feature_names=feature_names,
-            feature_types=feature_types,
-            threads=self.nthread,
-            max_bin=self.max_bin,
-        )
-        if enable_categorical:
-            raise NotImplementedError(
-                'categorical support is not enabled on DeviceQuantileDMatrix.'
-            )
-        self.handle = handle
-        if qid is not None and group is not None:
-            raise ValueError(
-                'Only one of the eval_qid or eval_group for each evaluation '
-                'dataset should be provided.'
-            )
-
-        self.feature_names = feature_names
-        self.feature_types = feature_types
+    def __init__(self):  # pylint: disable=super-init-not-called
+        self.handle = ctypes.c_void_p()
+        _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))

    def _set_data_from_cuda_interface(self, data):
        '''Set data from CUDA array interface.'''
@@ -1053,6 +997,116 @@ class DeviceQuantileDMatrix(DMatrix):
        )


+class DeviceQuantileDMatrix(DMatrix):
+    """Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do
+    not use this for test/validation tasks as some information may be lost in
+    quantisation. This DMatrix is primarily designed to save memory in training from
+    device memory inputs by avoiding intermediate storage. Set max_bin to control the
+    number of bins during quantisation.  See doc string in :py:obj:`xgboost.DMatrix` for
+    documents on meta info.
+
+    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
+
+    .. versionadded:: 1.1.0
+
+    """
+
+    @_deprecate_positional_args
+    def __init__(  # pylint: disable=super-init-not-called
+        self,
+        data,
+        label=None,
+        *,
+        weight=None,
+        base_margin=None,
+        missing=None,
+        silent=False,
+        feature_names=None,
+        feature_types=None,
+        nthread: Optional[int] = None,
+        max_bin: int = 256,
+        group=None,
+        qid=None,
+        label_lower_bound=None,
+        label_upper_bound=None,
+        feature_weights=None,
+        enable_categorical: bool = False,
+    ):
+        # Stored first because ``_init`` reads them when calling the C API.
+        self.max_bin = max_bin
+        self.missing = missing if missing is not None else np.nan
+        self.nthread = nthread if nthread is not None else 1
+        self._silent = silent  # unused, kept for compatibility
+
+        # An existing native handle is adopted as-is; no further setup is done.
+        if isinstance(data, ctypes.c_void_p):
+            self.handle = data
+            return
+
+        # Reject unsupported argument combinations before any native object
+        # is created.
+        if enable_categorical:
+            raise NotImplementedError(
+                'categorical support is not enabled on DeviceQuantileDMatrix.'
+            )
+        if qid is not None and group is not None:
+            raise ValueError(
+                'Only one of the eval_qid or eval_group for each evaluation '
+                'dataset should be provided.'
+            )
+
+        self._init(
+            data,
+            label=label,
+            weight=weight,
+            base_margin=base_margin,
+            group=group,
+            qid=qid,
+            label_lower_bound=label_lower_bound,
+            label_upper_bound=label_upper_bound,
+            feature_weights=feature_weights,
+            feature_names=feature_names,
+            feature_types=feature_types,
+        )
+
+    def _init(self, data, feature_names, feature_types, **meta):
+        """Create the native handle by feeding ``data`` through iterator callbacks.
+
+        ``data`` is used directly when ``_is_iter`` accepts it; otherwise it is
+        wrapped, together with ``meta`` and the feature names/types, in a
+        ``SingleBatchInternalIter``.  An exception recorded on the iterator is
+        raised in preference to the error reported by the C call.
+        """
+        from .data import (
+            _is_dlpack,
+            _transform_dlpack,
+            _is_iter,
+            SingleBatchInternalIter,
+        )
+
+        if _is_dlpack(data):
+            # We specialize for dlpack because cupy will take the memory from it so
+            # it can't be transformed twice.
+            data = _transform_dlpack(data)
+        if _is_iter(data):
+            it = data
+        else:
+            it = SingleBatchInternalIter(
+                data, **meta, feature_names=feature_names, feature_types=feature_types
+            )
+
+        # NOTE(review): the CFUNCTYPE objects are bound to locals, presumably so
+        # they stay alive while native code invokes them during the call below
+        # -- confirm before inlining them.
+        reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(it.reset_wrapper)
+        next_callback = ctypes.CFUNCTYPE(
+            ctypes.c_int,
+            ctypes.c_void_p,
+        )(it.next_wrapper)
+        handle = ctypes.c_void_p()
+        ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
+            None,
+            it.proxy.handle,
+            reset_callback,
+            next_callback,
+            ctypes.c_float(self.missing),
+            ctypes.c_int(self.nthread),
+            ctypes.c_int(self.max_bin),
+            ctypes.byref(handle),
+        )
+        if it.exception:
+            raise it.exception
+        # delay check_call to throw intermediate exception first
+        _check_call(ret)
+        self.handle = handle
+
+
 Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
 Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]

@@ -1346,7 +1400,7 @@ class Booster(object):

    def boost(self, dtrain, grad, hess):
        """Boost the booster for one iteration, with customized gradient
-        statistics.  Like :func:`xgboost.core.Booster.update`, this
+        statistics.  Like :py:func:`xgboost.Booster.update`, this
        function should not be called directly by users.

        Parameters
@@ -1360,7 +1414,9 @@ class Booster(object):

        """
        if len(grad) != len(hess):
-            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
+            raise ValueError(
+                'grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))
+            )
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
        self._validate_features(dtrain)
@@ -1453,17 +1509,12 @@ class Booster(object):
                training=False):
        """Predict with data.

-        .. note:: This function is not thread safe except for ``gbtree``
-                  booster.
+          .. note:: This function is not thread safe except for ``gbtree`` booster.

-          For ``gbtree`` booster, the thread safety is guaranteed by locks.
-          For lock free prediction use ``inplace_predict`` instead.  Also, the
-          safety does not hold when used in conjunction with other methods.
-
-          When using booster other than ``gbtree``, predict can only be called
-          from one thread.  If you want to run prediction using multiple
-          thread, call ``bst.copy()`` to make copies of model object and then
-          call ``predict()``.
+          When using booster other than ``gbtree``, predict can only be called from one
+          thread.  If you want to run prediction using multiple thread, call
+          :py:meth:`xgboost.Booster.copy` to make copies of model object and then call
+          ``predict()``.

        Parameters
        ----------
@@ -1579,9 +1630,17 @@ class Booster(object):
                preds = preds.reshape(nrow, chunk_size)
        return preds

-    def inplace_predict(self, data, iteration_range=(0, 0),
-                        predict_type='value', missing=np.nan):
-        '''Run prediction in-place, Unlike ``predict`` method, inplace prediction does
+    def inplace_predict(
+        self,
+        data,
+        iteration_range: Tuple[int, int] = (0, 0),
+        predict_type: str = "value",
+        missing: float = np.nan,
+        validate_features: bool = True,
+        base_margin: Any = None,
+        strict_shape: bool = False
+    ):
+        """Run prediction in-place, Unlike ``predict`` method, inplace prediction does
        not cache the prediction result.

        Calling only ``inplace_predict`` in multiple threads is safe and lock
@@ -1617,6 +1676,15 @@ class Booster(object):
        missing : float
            Value in the input data which needs to be present as a missing
            value.
+        validate_features:
+            See :py:meth:`xgboost.Booster.predict` for details.
+        base_margin:
+            See :py:obj:`xgboost.DMatrix` for details.
+        strict_shape:
+            When set to True, output shape is invariant to whether classification is used.
+            For both value and margin prediction, the output shape is (n_samples,
+            n_groups), n_groups == 1 when multi-class is not used.  Default to False, in
+            which case the output shape can be (n_samples, ) if multi-class is not used.

        Returns
        -------
@@ -1624,107 +1692,117 @@ class Booster(object):
            The prediction result.  When input data is on GPU, prediction
            result is stored in a cupy array.

-        '''
-
-        def reshape_output(predt, rows):
-            '''Reshape for multi-output prediction.'''
-            if predt.size != rows and predt.size % rows == 0:
-                cols = int(predt.size / rows)
-                predt = predt.reshape(rows, cols)
-                return predt
-            return predt
-
-        length = c_bst_ulong()
+        """
        preds = ctypes.POINTER(ctypes.c_float)()
-        iteration_range = (ctypes.c_uint(iteration_range[0]),
-                           ctypes.c_uint(iteration_range[1]))

        # once caching is supported, we can pass id(data) as cache id.
        try:
            import pandas as pd
+
            if isinstance(data, pd.DataFrame):
                data = data.values
        except ImportError:
            pass
+        args = {
+            "type": 0,
+            "training": False,
+            "iteration_begin": iteration_range[0],
+            "iteration_end": iteration_range[1],
+            "missing": missing,
+            "strict_shape": strict_shape,
+            "cache_id": 0,
+        }
+        if predict_type == "margin":
+            args["type"] = 1
+        shape = ctypes.POINTER(c_bst_ulong)()
+        dims = c_bst_ulong()
+
+        if base_margin is not None:
+            proxy = _ProxyDMatrix()
+            proxy.set_info(base_margin=base_margin)
+            p_handle = proxy.handle
+        else:
+            proxy = None
+            p_handle = ctypes.c_void_p()
+        assert proxy is None or isinstance(proxy, _ProxyDMatrix)
+        if validate_features:
+            # 1-D inputs are not checked.  Otherwise the column count (axis 1)
+            # must match the model, and the message reports that same value.
+            if len(data.shape) != 1 and self.num_features() != data.shape[1]:
+                raise ValueError(
+                    f"Feature shape mismatch, expected: {self.num_features()}, "
+                    f"got {data.shape[1]}"
+                )
+
        if isinstance(data, np.ndarray):
-            assert data.flags.c_contiguous
-            arr = np.array(data.reshape(data.size), copy=False,
-                           dtype=np.float32)
-            _check_call(_LIB.XGBoosterPredictFromDense(
-                self.handle,
-                arr.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-                c_bst_ulong(data.shape[0]),
-                c_bst_ulong(data.shape[1]),
-                ctypes.c_float(missing),
-                iteration_range[0],
-                iteration_range[1],
-                c_str(predict_type),
-                c_bst_ulong(0),
-                ctypes.byref(length),
-                ctypes.byref(preds)
-            ))
-            preds = ctypes2numpy(preds, length.value, np.float32)
-            rows = data.shape[0]
-            return reshape_output(preds, rows)
+            from .data import _maybe_np_slice
+            data = _maybe_np_slice(data, data.dtype)
+            _check_call(
+                _LIB.XGBoosterPredictFromDense(
+                    self.handle,
+                    _array_interface(data),
+                    from_pystr_to_cstr(json.dumps(args)),
+                    p_handle,
+                    ctypes.byref(shape),
+                    ctypes.byref(dims),
+                    ctypes.byref(preds),
+                )
+            )
+            return _prediction_output(shape, dims, preds, False)
        if isinstance(data, scipy.sparse.csr_matrix):
            csr = data
-            _check_call(_LIB.XGBoosterPredictFromCSR(
-                self.handle,
-                c_array(ctypes.c_size_t, csr.indptr),
-                c_array(ctypes.c_uint, csr.indices),
-                c_array(ctypes.c_float, csr.data),
-                ctypes.c_size_t(len(csr.indptr)),
-                ctypes.c_size_t(len(csr.data)),
-                ctypes.c_size_t(csr.shape[1]),
-                ctypes.c_float(missing),
-                iteration_range[0],
-                iteration_range[1],
-                c_str(predict_type),
-                c_bst_ulong(0),
-                ctypes.byref(length),
-                ctypes.byref(preds)))
-            preds = ctypes2numpy(preds, length.value, np.float32)
-            rows = data.shape[0]
-            return reshape_output(preds, rows)
-        if lazy_isinstance(data, 'cupy.core.core', 'ndarray'):
-            assert data.flags.c_contiguous
+            _check_call(
+                _LIB.XGBoosterPredictFromCSR(
+                    self.handle,
+                    _array_interface(csr.indptr),
+                    _array_interface(csr.indices),
+                    _array_interface(csr.data),
+                    ctypes.c_size_t(csr.shape[1]),
+                    from_pystr_to_cstr(json.dumps(args)),
+                    p_handle,
+                    ctypes.byref(shape),
+                    ctypes.byref(dims),
+                    ctypes.byref(preds),
+                )
+            )
+            return _prediction_output(shape, dims, preds, False)
+        if lazy_isinstance(data, "cupy.core.core", "ndarray"):
+            from .data import _transform_cupy_array
+            data = _transform_cupy_array(data)
            interface = data.__cuda_array_interface__
-            if 'mask' in interface:
-                interface['mask'] = interface['mask'].__cuda_array_interface__
-            interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
-            _check_call(_LIB.XGBoosterPredictFromArrayInterface(
-                self.handle,
-                interface_str,
-                ctypes.c_float(missing),
-                iteration_range[0],
-                iteration_range[1],
-                c_str(predict_type),
-                c_bst_ulong(0),
-                ctypes.byref(length),
-                ctypes.byref(preds)))
-            mem = ctypes2cupy(preds, length, np.float32)
-            rows = data.shape[0]
-            return reshape_output(mem, rows)
-        if lazy_isinstance(data, 'cudf.core.dataframe', 'DataFrame'):
+            if "mask" in interface:
+                interface["mask"] = interface["mask"].__cuda_array_interface__
+            interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
+            _check_call(
+                _LIB.XGBoosterPredictFromArrayInterface(
+                    self.handle,
+                    interface_str,
+                    from_pystr_to_cstr(json.dumps(args)),
+                    p_handle,
+                    ctypes.byref(shape),
+                    ctypes.byref(dims),
+                    ctypes.byref(preds),
+                )
+            )
+            return _prediction_output(shape, dims, preds, True)
+        if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
            from .data import _cudf_array_interfaces
-            interfaces_str = _cudf_array_interfaces(data)
-            _check_call(_LIB.XGBoosterPredictFromArrayInterfaceColumns(
-                self.handle,
-                interfaces_str,
-                ctypes.c_float(missing),
-                iteration_range[0],
-                iteration_range[1],
-                c_str(predict_type),
-                c_bst_ulong(0),
-                ctypes.byref(length),
-                ctypes.byref(preds)))
-            mem = ctypes2cupy(preds, length, np.float32)
-            rows = data.shape[0]
-            predt = reshape_output(mem, rows)
-            return predt

-        raise TypeError('Data type:' + str(type(data)) +
-                        ' not supported by inplace prediction.')
+            interfaces_str = _cudf_array_interfaces(data)
+            _check_call(
+                _LIB.XGBoosterPredictFromArrayInterfaceColumns(
+                    self.handle,
+                    interfaces_str,
+                    from_pystr_to_cstr(json.dumps(args)),
+                    p_handle,
+                    ctypes.byref(shape),
+                    ctypes.byref(dims),
+                    ctypes.byref(preds),
+                )
+            )
+            return _prediction_output(shape, dims, preds, True)
+
+        raise TypeError(
+            "Data type:" + str(type(data)) + " not supported by inplace prediction."
+        )

    def save_model(self, fname):
        """Save the model to a file.
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -187,8 +187,8 @@ class DaskDMatrix:
    `DaskDMatrix` forces all lazy computation to be carried out.  Wait for the input data
    explicitly if you want to see actual computation of constructing `DaskDMatrix`.

-    See doc string for DMatrix constructor for other parameters.  DaskDMatrix accepts only
-    dask collection.
+    See doc for :py:obj:`xgboost.DMatrix` constructor for other parameters.  DaskDMatrix
+    accepts only dask collection.

    .. note::

@@ -575,7 +575,8 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
    memory usage by eliminating data copies.  Internally the all partitions/chunks of data
    are merged by weighted GK sketching.  So the number of partitions from dask may affect
    training accuracy as GK generates bounded error for each merge.  See doc string for
-    `DeviceQuantileDMatrix` and `DMatrix` for other parameters.
+    :py:obj:`xgboost.DeviceQuantileDMatrix` and :py:obj:`xgboost.DMatrix` for other
+    parameters.

    .. versionadded:: 1.2.0

--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -5,11 +5,12 @@ import ctypes
 import json
 import warnings
 import os
+from typing import Any

 import numpy as np

 from .core import c_array, _LIB, _check_call, c_str
-from .core import DataIter, DeviceQuantileDMatrix, DMatrix
+from .core import DataIter, _ProxyDMatrix, DMatrix
 from .compat import lazy_isinstance

 c_bst_ulong = ctypes.c_uint64   # pylint: disable=invalid-name
@@ -113,7 +114,7 @@ def _maybe_np_slice(data, dtype):
    return data


-def _transform_np_array(data: np.ndarray):
+def _transform_np_array(data: np.ndarray) -> np.ndarray:
    if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
        data = np.array(data, copy=False)
    if len(data.shape) != 2:
@@ -142,7 +143,7 @@ def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
    input layout and type if memory use is a concern.

    """
-    flatten = _transform_np_array(data)
+    flatten: np.ndarray = _transform_np_array(data)
    handle = ctypes.c_void_p()
    _check_call(_LIB.XGDMatrixCreateFromMat_omp(
        flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
@@ -783,54 +784,6 @@ class SingleBatchInternalIter(DataIter):  # pylint: disable=R0902
        self.it = 0


-def init_device_quantile_dmatrix(
-        data, missing, max_bin, threads, feature_names, feature_types, **meta
-):
-    '''Constructor for DeviceQuantileDMatrix.'''
-    if not any([_is_cudf_df(data), _is_cudf_ser(data), _is_cupy_array(data),
-                _is_dlpack(data), _is_iter(data)]):
-        raise TypeError(str(type(data)) +
-                        ' is not supported for DeviceQuantileDMatrix')
-    if _is_dlpack(data):
-        # We specialize for dlpack because cupy will take the memory from it so
-        # it can't be transformed twice.
-        data = _transform_dlpack(data)
-    if _is_iter(data):
-        it = data
-    else:
-        it = SingleBatchInternalIter(
-            data, **meta, feature_names=feature_names,
-            feature_types=feature_types)
-
-    reset_factory = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
-    reset_callback = reset_factory(it.reset_wrapper)
-    next_factory = ctypes.CFUNCTYPE(
-        ctypes.c_int,
-        ctypes.c_void_p,
-    )
-    next_callback = next_factory(it.next_wrapper)
-    handle = ctypes.c_void_p()
-    ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
-        None,
-        it.proxy.handle,
-        reset_callback,
-        next_callback,
-        ctypes.c_float(missing),
-        ctypes.c_int(threads),
-        ctypes.c_int(max_bin),
-        ctypes.byref(handle)
-    )
-    if it.exception:
-        raise it.exception
-    # delay check_call to throw intermediate exception first
-    _check_call(ret)
-    matrix = DeviceQuantileDMatrix(handle)
-    feature_names = matrix.feature_names
-    feature_types = matrix.feature_types
-    matrix.handle = None
-    return handle, feature_names, feature_types
-
-
 def _device_quantile_transform(data, feature_names, feature_types):
    if _is_cudf_df(data):
        return _transform_cudf_df(data, feature_names, feature_types)
@@ -845,7 +798,7 @@ def _device_quantile_transform(data, feature_names, feature_types):
                    str(type(data)))


-def dispatch_device_quantile_dmatrix_set_data(proxy, data):
+def dispatch_device_quantile_dmatrix_set_data(proxy: _ProxyDMatrix, data: Any) -> None:
    '''Dispatch for DeviceQuantileDMatrix.'''
    if _is_cudf_df(data):
        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212