Accept iterator in device dmatrix. (#5783)

* Remove Device DMatrix.
2020-07-07 21:44:48 +08:00
parent 048d969be4
commit a3ec964346
12 changed files with 495 additions and 343 deletions
--- a/demo/guide-python/data_iterator.py
+++ b/demo/guide-python/data_iterator.py
@@ -0,0 +1,109 @@
 '''A demo for defining data iterator.
 The demo that defines a customized iterator for passing batches of data into
 `xgboost.DeviceQuantileDMatrix` and use this `DeviceQuantileDMatrix` for
 training.  The feature is used primarily designed to reduce the required GPU
 memory for training on distributed environment.
 Aftering going through the demo, one might ask why don't we use more native
 Python iterator?  That's because XGBoost requires a `reset` function, while
 using `itertools.tee` might incur significant memory usage according to:
  https://docs.python.org/3/library/itertools.html#itertools.tee.
 '''
 import xgboost
 import cupy
 import numpy
 COLS = 64
 ROWS_PER_BATCH = 1000            # data is splited by rows
 BATCHES = 32
 class IterForDMatrixDemo(xgboost.core.DataIter):
    '''A data iterator for XGBoost DMatrix.
    `reset` and `next` are required for any data iterator, other functions here
    are utilites for demonstration's purpose.
    '''
    def __init__(self):
        '''Generate some random data for demostration.
        Actual data can be anything that is currently supported by XGBoost.
        '''
        self.rows = ROWS_PER_BATCH
        self.cols = COLS
        rng = cupy.random.RandomState(1994)
        self._data = [rng.randn(self.rows, self.cols)] * BATCHES
        self._labels = [rng.randn(self.rows)] * BATCHES
        self._weights = [rng.randn(self.rows)] * BATCHES
        self.it = 0             # set iterator to 0
        super().__init__()
    def as_array(self):
        return cupy.concatenate(self._data)
    def as_array_labels(self):
        return cupy.concatenate(self._labels)
    def as_array_weights(self):
        return cupy.concatenate(self._weights)
    def data(self):
        '''Utility function for obtaining current batch of data.'''
        return self._data[self.it]
    def labels(self):
        '''Utility function for obtaining current batch of label.'''
        return self._labels[self.it]
    def weights(self):
        return self._weights[self.it]
    def reset(self):
        '''Reset the iterator'''
        self.it = 0
    def next(self, input_data):
        '''Yield next batch of data.'''
        if self.it == len(self._data):
            # Return 0 when there's no more batch.
            return 0
        input_data(data=self.data(), label=self.labels(),
                   weight=self.weights())
        self.it += 1
        return 1
 def main():
    rounds = 100
    it = IterForDMatrixDemo()
    # Use iterator, must be `DeviceQuantileDMatrix`
    m_with_it = xgboost.DeviceQuantileDMatrix(it)
    # Use regular DMatrix.
    m = xgboost.DMatrix(it.as_array(), it.as_array_labels(),
                        weight=it.as_array_weights())
    assert m_with_it.num_col() == m.num_col()
    assert m_with_it.num_row() == m.num_row()
    reg_with_it = xgboost.train({'tree_method': 'gpu_hist'}, m_with_it,
                                num_boost_round=rounds)
    predict_with_it = reg_with_it.predict(m_with_it)
    reg = xgboost.train({'tree_method': 'gpu_hist'}, m,
                        num_boost_round=rounds)
    predict = reg.predict(m)
    numpy.testing.assert_allclose(predict_with_it, predict,
                                  rtol=1e6)
 if __name__ == '__main__':
    main()
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -300,6 +300,99 @@ def _cudf_array_interfaces(df):
    return interfaces_str
 class DataIter:
    '''The interface for user defined data iterator. Currently is only
    supported by Device DMatrix.
    Parameters
    ----------
    rows : int
        Total number of rows combining all batches.
    cols : int
        Number of columns for each batch.
    '''
    def __init__(self):
        proxy_handle = ctypes.c_void_p()
        _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(proxy_handle)))
        self._handle = DeviceQuantileDMatrix(proxy_handle)
        self.exception = None
    @property
    def proxy(self):
        '''Handler of DMatrix proxy.'''
        return self._handle
    def reset_wrapper(self, this):  # pylint: disable=unused-argument
        '''A wrapper for user defined `reset` function.'''
        self.reset()
    def next_wrapper(self, this):  # pylint: disable=unused-argument
        '''A wrapper for user defined `next` function.
        `this` is not used in Python.  ctypes can handle `self` of a Python
        member function automatically when converting a it to c function
        pointer.
        '''
        if self.exception is not None:
            return 0
        def data_handle(data, label=None, weight=None, base_margin=None,
                        group=None,
                        label_lower_bound=None, label_upper_bound=None):
            if lazy_isinstance(data, 'cudf.core.dataframe', 'DataFrame'):
                # pylint: disable=protected-access
                self.proxy._set_data_from_cuda_columnar(data)
            elif lazy_isinstance(data, 'cudf.core.series', 'Series'):
                # pylint: disable=protected-access
                self.proxy._set_data_from_cuda_columnar(data)
            elif lazy_isinstance(data, 'cupy.core.core', 'ndarray'):
                # pylint: disable=protected-access
                self.proxy._set_data_from_cuda_interface(data)
            else:
                raise TypeError(
                    'Value type is not supported for data iterator:' +
                    str(type(self._handle)), type(data))
            self.proxy.set_info(label=label, weight=weight,
                                base_margin=base_margin,
                                group=group,
                                label_lower_bound=label_lower_bound,
                                label_upper_bound=label_upper_bound)
        try:
            # Deffer the exception in order to return 0 and stop the iteration.
            # Exception inside a ctype callback function has no effect except
            # for printing to stderr (doesn't stop the execution).
            ret = self.next(data_handle)  # pylint: disable=not-callable
        except Exception as e:            # pylint: disable=broad-except
            tb = sys.exc_info()[2]
            print('Got an exception in Python')
            self.exception = e.with_traceback(tb)
            return 0
        return ret
    def reset(self):
        '''Reset the data iterator.  Prototype for user defined function.'''
        raise NotImplementedError()
    def next(self, input_data):
        '''Set the next batch of data.
        Parameters
        ----------
        data_handle: callable
            A function with same data fields like `data`, `label` with
            `xgboost.DMatrix`.
        Returns
        -------
        0 if there's no more batch, otherwise 1.
        '''
        raise NotImplementedError()
 class DMatrix:                  # pylint: disable=too-many-instance-attributes
    """Data Matrix used in XGBoost.
@@ -361,36 +454,65 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
            self.handle = None
            return
-        handler = self.get_data_handler(data)
+        handler = self._get_data_handler(data)
        can_handle_meta = False
        if handler is None:
            data = _convert_unknown_data(data, None)
-            handler = self.get_data_handler(data)
+            handler = self._get_data_handler(data)
        try:
            handler.handle_meta(label, weight, base_margin)
            can_handle_meta = True
        except NotImplementedError:
            can_handle_meta = False
        self.handle, feature_names, feature_types = handler.handle_input(
            data, feature_names, feature_types)
        assert self.handle, 'Failed to construct a DMatrix.'
-        if label is not None:
+        if not can_handle_meta:
-            self.set_label(label)
+            self.set_info(label, weight, base_margin)
        if weight is not None:
            self.set_weight(weight)
        if base_margin is not None:
            self.set_base_margin(base_margin)
        self.feature_names = feature_names
        self.feature_types = feature_types
-    def get_data_handler(self, data, meta=None, meta_type=None):
+    def _get_data_handler(self, data, meta=None, meta_type=None):
        '''Get data handler for this DMatrix class.'''
        from .data import get_dmatrix_data_handler
        handler = get_dmatrix_data_handler(
            data, self.missing, self.nthread, self.silent, meta, meta_type)
        return handler
    # pylint: disable=no-self-use
    def _get_meta_handler(self, data, meta, meta_type):
        from .data import get_dmatrix_meta_handler
        handler = get_dmatrix_meta_handler(
            data, meta, meta_type)
        return handler
    def __del__(self):
        if hasattr(self, "handle") and self.handle:
            _check_call(_LIB.XGDMatrixFree(self.handle))
            self.handle = None
    def set_info(self,
                 label=None, weight=None, base_margin=None,
                 group=None,
                 label_lower_bound=None,
                 label_upper_bound=None):
        '''Set meta info for DMatrix.'''
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)
        if base_margin is not None:
            self.set_base_margin(base_margin)
        if group is not None:
            self.set_group(group)
        if label_lower_bound is not None:
            self.set_float_info('label_lower_bound', label_lower_bound)
        if label_upper_bound is not None:
            self.set_float_info('label_upper_bound', label_upper_bound)
    def get_float_info(self, field):
        """Get float property from the DMatrix.
@@ -447,10 +569,8 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
        if isinstance(data, np.ndarray):
            self.set_float_info_npy2d(field, data)
            return
-        handler = self.get_data_handler(data, field, np.float32)
+        handler = self._get_data_handler(data, field, np.float32)
-        if handler is None:
+        assert handler
            data = _convert_unknown_data(data, field, np.float32)
            handler = self.get_data_handler(data, field, np.float32)
        data, _, _ = handler.transform(data)
        c_data = c_array(ctypes.c_float, data)
        _check_call(_LIB.XGDMatrixSetFloatInfo(self.handle,
@@ -470,7 +590,7 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
        data: numpy array
            The array of data to be set
        """
-        data, _, _ = self.get_data_handler(
+        data, _, _ = self._get_meta_handler(
            data, field, np.float32).transform(data)
        c_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        _check_call(_LIB.XGDMatrixSetFloatInfo(self.handle,
@@ -489,7 +609,7 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
        data: numpy array
            The array of data to be set
        """
-        data, _, _ = self.get_data_handler(
+        data, _, _ = self._get_data_handler(
            data, field, 'uint32').transform(data)
        _check_call(_LIB.XGDMatrixSetUIntInfo(self.handle,
                                              c_str(field),
@@ -803,11 +923,11 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
 class DeviceQuantileDMatrix(DMatrix):
-    """Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do not
+    """Device memory Data Matrix used in XGBoost for training with
-    use this for test/validation tasks as some information may be lost in quantisation. This
+    tree_method='gpu_hist'. Do not use this for test/validation tasks as some
-    DMatrix is primarily designed to save memory in training from device memory inputs by
+    information may be lost in quantisation. This DMatrix is primarily designed
-    avoiding intermediate storage. Implementation does not currently consider weights in
+    to save memory in training from device memory inputs by avoiding
-    quantisation process(unlike DMatrix). Set max_bin to control the number of bins during
+    intermediate storage. Set max_bin to control the number of bins during
    quantisation.
    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
@@ -823,6 +943,9 @@ class DeviceQuantileDMatrix(DMatrix):
                 feature_types=None,
                 nthread=None, max_bin=256):
        self.max_bin = max_bin
        if isinstance(data, ctypes.c_void_p):
            self.handle = data
            return
        super().__init__(data, label=label, weight=weight,
                         base_margin=base_margin,
                         missing=missing,
@@ -831,11 +954,32 @@ class DeviceQuantileDMatrix(DMatrix):
                         feature_types=feature_types,
                         nthread=nthread)
-    def get_data_handler(self, data, meta=None, meta_type=None):
+    def _get_data_handler(self, data, meta=None, meta_type=None):
        from .data import get_device_quantile_dmatrix_data_handler
        return get_device_quantile_dmatrix_data_handler(
            data, self.max_bin, self.missing, self.nthread, self.silent)
    def _set_data_from_cuda_interface(self, data):
        '''Set data from CUDA array interface.'''
        interface = data.__cuda_array_interface__
        interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
        _check_call(
            _LIB.XGDeviceQuantileDMatrixSetDataCudaArrayInterface(
                self.handle,
                interface_str
            )
        )
    def _set_data_from_cuda_columnar(self, data):
        '''Set data from CUDA columnar format.1'''
        interfaces_str = _cudf_array_interfaces(data)
        _check_call(
            _LIB.XGDeviceQuantileDMatrixSetDataCudaColumnar(
                self.handle,
                interfaces_str
            )
        )
 class Booster(object):
    # pylint: disable=too-many-public-methods
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, no-self-use
+# pylint: disable=too-many-arguments, no-self-use, too-many-instance-attributes
 '''Data dispatching for DMatrix.'''
 import ctypes
 import abc
@@ -8,6 +8,7 @@ import warnings
 import numpy as np
 from .core import c_array, _LIB, _check_call, c_str, _cudf_array_interfaces
 from .core import DataIter
 from .compat import lazy_isinstance, STRING_TYPES, os_fspath, os_PathLike
 c_bst_ulong = ctypes.c_uint64   # pylint: disable=invalid-name
@@ -23,6 +24,18 @@ class DataHandler(abc.ABC):
        self.meta = meta
        self.meta_type = meta_type
    def handle_meta(self, label=None, weight=None, base_margin=None,
                    group=None,
                    label_lower_bound=None,
                    label_upper_bound=None):
        '''Handle meta data when the DMatrix type can not defer setting meta
        data after construction.  Example is `DeviceQuantileDMatrix`
        which requires weight to be presented before digesting
        data.
        '''
        raise NotImplementedError()
    def _warn_unused_missing(self, data):
        if not (np.isnan(np.nan) or None):
            warnings.warn(
@@ -116,6 +129,14 @@ def get_dmatrix_data_handler(data, missing, nthread, silent,
    return handler(missing, nthread, silent, meta, meta_type)
 def get_dmatrix_meta_handler(data, meta, meta_type):
    '''Get handler for meta instead of data.'''
    handler = __dmatrix_registry.get_handler(data)
    if handler is None:
        return None
    return handler(None, 0, True, meta, meta_type)
 class FileHandler(DataHandler):
    '''Handler of path like input.'''
    def handle_input(self, data, feature_names, feature_types):
@@ -511,6 +532,43 @@ __dmatrix_registry.register_handler_opaque(
    DLPackHandler)
 class SingleBatchInternalIter(DataIter):
    '''An iterator for single batch data to help creating device DMatrix.
    Transforming input directly to histogram with normal single batch data API
    can not access weight for sketching.  So this iterator acts as a staging
    area for meta info.
    '''
    def __init__(self, data, label, weight, base_margin, group,
                 label_lower_bound, label_upper_bound):
        self.data = data
        self.label = label
        self.weight = weight
        self.base_margin = base_margin
        self.group = group
        self.label_lower_bound = label_lower_bound
        self.label_upper_bound = label_upper_bound
        self.it = 0             # pylint: disable=invalid-name
        super().__init__()
    def next(self, input_data):
        if self.it == 1:
            return 0
        self.it += 1
        input_data(data=self.data, label=self.label,
                   weight=self.weight, base_margin=self.base_margin,
                   group=self.group,
                   label_lower_bound=self.label_lower_bound,
                   label_upper_bound=self.label_upper_bound)
        return 1
    def reset(self):
        self.it = 0
 __device_quantile_dmatrix_registry = DMatrixDataManager()  # pylint: disable=invalid-name
 class DeviceQuantileDMatrixDataHandler(DataHandler):  # pylint: disable=abstract-method
    '''Base class of data handler for `DeviceQuantileDMatrix`.'''
    def __init__(self, max_bin, missing, nthread, silent,
@@ -518,8 +576,53 @@ class DeviceQuantileDMatrixDataHandler(DataHandler):  # pylint: disable=abstract
        self.max_bin = max_bin
        super().__init__(missing, nthread, silent, meta, meta_type)
    def handle_meta(self, label=None, weight=None, base_margin=None,
                    group=None,
                    label_lower_bound=None,
                    label_upper_bound=None):
        self.label = label
        self.weight = weight
        self.base_margin = base_margin
        self.group = group
        self.label_lower_bound = label_lower_bound
        self.label_upper_bound = label_upper_bound
-__device_quantile_dmatrix_registry = DMatrixDataManager()  # pylint: disable=invalid-name
+    def handle_input(self, data, feature_names, feature_types):
        if not isinstance(data, DataIter):
            it = SingleBatchInternalIter(
                data, self.label, self.weight,
                self.base_margin, self.group,
                self.label_lower_bound, self.label_upper_bound)
        else:
            it = data
        reset_factory = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
        reset_callback = reset_factory(it.reset_wrapper)
        next_factory = ctypes.CFUNCTYPE(
            ctypes.c_int,
            ctypes.c_void_p,
        )
        next_callback = next_factory(it.next_wrapper)
        handle = ctypes.c_void_p()
        ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
            None,
            it.proxy.handle,
            reset_callback,
            next_callback,
            ctypes.c_float(self.missing),
            ctypes.c_int(self.nthread),
            ctypes.c_int(self.max_bin),
            ctypes.byref(handle)
        )
        if it.exception:
            raise it.exception
        # delay check_call to throw intermediate exception first
        _check_call(ret)
        return handle, feature_names, feature_types
 __device_quantile_dmatrix_registry.register_handler_opaque(
    lambda x: isinstance(x, DataIter),
    DeviceQuantileDMatrixDataHandler)
 def get_device_quantile_dmatrix_data_handler(
@@ -549,19 +652,7 @@ class DeviceQuantileCudaArrayInterfaceHandler(
                data, '__array__'):
            import cupy         # pylint: disable=import-error
            data = cupy.array(data, copy=False)
-
+        return super().handle_input(data, feature_names, feature_types)
        interface = data.__cuda_array_interface__
        if 'mask' in interface:
            interface['mask'] = interface['mask'].__cuda_array_interface__
        interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
        handle = ctypes.c_void_p()
        _check_call(
            _LIB.XGDeviceQuantileDMatrixCreateFromArrayInterface(
                interface_str,
                ctypes.c_float(self.missing), ctypes.c_int(self.nthread),
                ctypes.c_int(self.max_bin), ctypes.byref(handle)))
        return handle, feature_names, feature_types
 __device_quantile_dmatrix_registry.register_handler(
@@ -582,14 +673,7 @@ class DeviceQuantileCudaColumnarHandler(DeviceQuantileDMatrixDataHandler,
        """Initialize Quantile Device DMatrix from columnar memory format."""
        data, feature_names, feature_types = self._maybe_cudf_dataframe(
            data, feature_names, feature_types)
-        interfaces_str = _cudf_array_interfaces(data)
+        return super().handle_input(data, feature_names, feature_types)
        handle = ctypes.c_void_p()
        _check_call(
            _LIB.XGDeviceQuantileDMatrixCreateFromArrayInterfaceColumns(
                interfaces_str,
                ctypes.c_float(self.missing), ctypes.c_int(self.nthread),
                ctypes.c_int(self.max_bin), ctypes.byref(handle)))
        return handle, feature_names, feature_types
 __device_quantile_dmatrix_registry.register_handler(
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -4,7 +4,6 @@
 #include "xgboost/learner.h"
 #include "c_api_error.h"
 #include "../data/device_adapter.cuh"
 #include "../data/device_dmatrix.h"
 using namespace xgboost;  // NOLINT
@@ -31,28 +30,6 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
  API_END();
 }
 XGB_DLL int XGDeviceQuantileDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
  bst_float missing, int nthread, int max_bin,
  DMatrixHandle* out) {
  API_BEGIN();
  std::string json_str{c_json_strs};
  data::CudfAdapter adapter(json_str);
  *out =
    new std::shared_ptr<DMatrix>(new data::DeviceDMatrix(&adapter, missing, nthread, max_bin));
  API_END();
 }
 XGB_DLL int XGDeviceQuantileDMatrixCreateFromArrayInterface(char const* c_json_strs,
  bst_float missing, int nthread, int max_bin,
  DMatrixHandle* out) {
  API_BEGIN();
  std::string json_str{c_json_strs};
  data::CupyAdapter adapter(json_str);
  *out =
    new std::shared_ptr<DMatrix>(new data::DeviceDMatrix(&adapter, missing, nthread, max_bin));
  API_END();
 }
 // A hidden API as cache id is not being supported yet.
 XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(BoosterHandle handle,
                                                      char const* c_json_strs,
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -201,7 +201,7 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
 // Returns maximum row length
 template <typename AdapterBatchT>
-size_t GetRowCounts(const AdapterBatchT& batch, common::Span<size_t> offset,
+size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
                    int device_idx, float missing) {
  IsValidFunctor is_valid(missing);
  // Count elements per row
--- a/src/data/device_dmatrix.cu
+++ b/src/data/device_dmatrix.cu
@@ -1,58 +0,0 @@
 /*!
 * Copyright 2020 by Contributors
 * \file device_dmatrix.cu
 * \brief Device-memory version of DMatrix.
 */
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
 #include <xgboost/base.h>
 #include <xgboost/data.h>
 #include <memory>
 #include <utility>
 #include "../common/hist_util.h"
 #include "adapter.h"
 #include "device_adapter.cuh"
 #include "ellpack_page.cuh"
 #include "device_dmatrix.h"
 namespace xgboost {
 namespace data {
 // Does not currently support metainfo as no on-device data source contains this
 // Current implementation assumes a single batch. More batches can
 // be supported in future. Does not currently support inferring row/column size
 template <typename AdapterT>
 DeviceDMatrix::DeviceDMatrix(AdapterT* adapter, float missing, int nthread, int max_bin) {
  dh::safe_cuda(cudaSetDevice(adapter->DeviceIdx()));
  auto& batch = adapter->Value();
  // Work out how many valid entries we have in each row
  dh::caching_device_vector<size_t> row_counts(adapter->NumRows() + 1, 0);
  common::Span<size_t> row_counts_span(row_counts.data().get(),
                                       row_counts.size());
  size_t row_stride =
      GetRowCounts(batch, row_counts_span, adapter->DeviceIdx(), missing);
  dh::XGBCachingDeviceAllocator<char> alloc;
  info_.num_nonzero_ = thrust::reduce(thrust::cuda::par(alloc),
                                      row_counts.begin(), row_counts.end());
  info_.num_col_ = adapter->NumColumns();
  info_.num_row_ = adapter->NumRows();
  ellpack_page_.reset(new EllpackPage());
  *ellpack_page_->Impl() =
      EllpackPageImpl(adapter, missing, this->IsDense(), nthread, max_bin,
                      row_counts_span, row_stride);
  // Synchronise worker columns
  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
 }
 #define DEVICE_DMARIX_SPECIALIZATION(__ADAPTER_T)                       \
  template DeviceDMatrix::DeviceDMatrix(__ADAPTER_T* adapter, float missing, \
                                        int nthread, int max_bin);
 DEVICE_DMARIX_SPECIALIZATION(CudfAdapter);
 DEVICE_DMARIX_SPECIALIZATION(CupyAdapter);
 }  // namespace data
 }  // namespace xgboost
--- a/src/data/device_dmatrix.h
+++ b/src/data/device_dmatrix.h
@@ -1,64 +0,0 @@
 /*!
 * Copyright 2020 by Contributors
 * \file device_dmatrix.h
 * \brief Device-memory version of DMatrix.
 */
 #ifndef XGBOOST_DATA_DEVICE_DMATRIX_H_
 #define XGBOOST_DATA_DEVICE_DMATRIX_H_
 #include <xgboost/base.h>
 #include <xgboost/data.h>
 #include <memory>
 #include "adapter.h"
 #include "simple_batch_iterator.h"
 #include "simple_dmatrix.h"
 namespace xgboost {
 namespace data {
 class DeviceDMatrix : public DMatrix {
 public:
  template <typename AdapterT>
  explicit DeviceDMatrix(AdapterT* adapter, float missing, int nthread, int max_bin);
  MetaInfo& Info() override { return info_; }
  const MetaInfo& Info() const override { return info_; }
  bool SingleColBlock() const override { return true; }
  bool EllpackExists() const override { return true; }
  bool SparsePageExists() const override { return false; }
  DMatrix *Slice(common::Span<int32_t const> ridxs) override {
    LOG(FATAL) << "Slicing DMatrix is not supported for Device DMatrix.";
    return nullptr;
  }
 private:
  BatchSet<SparsePage> GetRowBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
  }
  BatchSet<CSCPage> GetColumnBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<CSCPage>(BatchIterator<CSCPage>(nullptr));
  }
  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(nullptr));
  }
  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override {
    auto begin_iter = BatchIterator<EllpackPage>(
        new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_.get()));
    return BatchSet<EllpackPage>(begin_iter);
  }
  MetaInfo info_;
  // source data pointer.
  std::unique_ptr<EllpackPage> ellpack_page_;
 };
 }  // namespace data
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_DEVICE_DMATRIX_H_
--- a/src/data/iterative_device_dmatrix.cu
+++ b/src/data/iterative_device_dmatrix.cu
@@ -93,6 +93,11 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
    batches++;
  }
  if (device < 0) {  // error or empty
    this->page_.reset(new EllpackPage);
    return;
  }
  common::SketchContainer final_sketch(batch_param_.max_bin, cols, accumulated_rows, device);
  for (auto const& sketch : sketch_containers) {
    final_sketch.Merge(sketch.ColumnsPtr(), sketch.Data());
@@ -108,14 +113,23 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
  this->info_.num_row_ = accumulated_rows;
  this->info_.num_nonzero_ = nnz;
-  // Construct the final ellpack page.
+  auto init_page = [this, &proxy, &cuts, row_stride, accumulated_rows]() {
-  page_.reset(new EllpackPage);
+    if (!page_) {
-  *(page_->Impl()) = EllpackPageImpl(proxy->DeviceIdx(), cuts, this->IsDense(),
+      // Should be put inside the while loop to protect against empty batch.  In
-                                     row_stride, accumulated_rows);
+      // that case device id is invalid.
      page_.reset(new EllpackPage);
      *(page_->Impl()) =
          EllpackPageImpl(proxy->DeviceIdx(), cuts, this->IsDense(), row_stride,
                          accumulated_rows);
    }
  };
  // Construct the final ellpack page.
  size_t offset = 0;
  iter.Reset();
  size_t n_batches_for_verification = 0;
  while (iter.Next()) {
    init_page();
    auto device = proxy->DeviceIdx();
    dh::safe_cuda(cudaSetDevice(device));
    auto rows = num_rows();
@@ -138,7 +152,10 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
    if (batches != 1) {
      this->info_.Extend(std::move(proxy->Info()), false);
    }
    n_batches_for_verification++;
  }
  CHECK_EQ(batches, n_batches_for_verification)
      << "Different number of batches returned between 2 iterations";
  if (batches == 1) {
    this->info_ = std::move(proxy->Info());
--- a/tests/cpp/data/test_device_dmatrix.cu
+++ b/tests/cpp/data/test_device_dmatrix.cu
@@ -1,149 +0,0 @@
 // Copyright (c) 2019 by Contributors
 #include <gtest/gtest.h>
 #include <xgboost/data.h>
 #include "../../../src/data/adapter.h"
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/device_dmatrix.h"
 #include "../helpers.h"
 #include <thrust/device_vector.h>
 #include "../../../src/data/device_adapter.cuh"
 #include "../../../src/gbm/gbtree_model.h"
 #include "../common/test_hist_util.h"
 #include "../../../src/common/compressed_iterator.h"
 #include "../../../src/common/math.h"
 #include "test_array_interface.h"
 using namespace xgboost;  // NOLINT
 TEST(DeviceDMatrix, RowMajor) {
  int num_rows = 1000;
  int num_columns = 50;
  auto x = common::GenerateRandom(num_rows, num_columns);
  auto x_device = thrust::device_vector<float>(x);
  auto adapter = common::AdapterFromData(x_device, num_rows, num_columns);
  data::DeviceDMatrix dmat(&adapter,
                           std::numeric_limits<float>::quiet_NaN(), 1, 256);
  auto &batch = *dmat.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto impl = batch.Impl();
  common::CompressedIterator<uint32_t> iterator(
      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
  for(auto i = 0ull; i < x.size(); i++)
  {
    int column_idx = i % num_columns;
    EXPECT_EQ(impl->Cuts().SearchBin(x[i], column_idx), iterator[i]);
  }
  EXPECT_EQ(dmat.Info().num_col_, num_columns);
  EXPECT_EQ(dmat.Info().num_row_, num_rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, num_rows * num_columns);
 }
 TEST(DeviceDMatrix, RowMajorMissing) {
  const float kMissing = std::numeric_limits<float>::quiet_NaN();
  int num_rows = 10;
  int num_columns = 2;
  auto x = common::GenerateRandom(num_rows, num_columns);
  x[1] = kMissing;
  x[5] = kMissing;
  x[6] = kMissing;
  auto x_device = thrust::device_vector<float>(x);
  auto adapter = common::AdapterFromData(x_device, num_rows, num_columns);
  data::DeviceDMatrix dmat(&adapter, kMissing, 1, 256);
  auto &batch = *dmat.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto impl = batch.Impl();
  common::CompressedIterator<uint32_t> iterator(
      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue());
  // null values get placed after valid values in a row
  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(dmat.Info().num_col_, num_columns);
  EXPECT_EQ(dmat.Info().num_row_, num_rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, num_rows*num_columns-3);
 }
 TEST(DeviceDMatrix, ColumnMajor) {
  constexpr size_t kRows{100};
  std::vector<Json> columns;
  thrust::device_vector<double> d_data_0(kRows);
  thrust::device_vector<uint32_t> d_data_1(kRows);
  columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_data_0));
  columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_data_1));
  Json column_arr{columns};
  std::string str;
  Json::Dump(column_arr, &str);
  data::CudfAdapter adapter(str);
  data::DeviceDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
                           -1, 256);
  auto &batch = *dmat.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto impl = batch.Impl();
  common::CompressedIterator<uint32_t> iterator(
      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
  for (auto i = 0ull; i < kRows; i++) {
    for (auto j = 0ull; j < columns.size(); j++) {
      if (j == 0) {
        EXPECT_EQ(iterator[i * 2 + j], impl->Cuts().SearchBin(d_data_0[i], j));
      } else {
        EXPECT_EQ(iterator[i * 2 + j], impl->Cuts().SearchBin(d_data_1[i], j));
      }
    }
  }
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, kRows);
  EXPECT_EQ(dmat.Info().num_nonzero_, kRows*2);
 }
 // Test equivalence with simple DMatrix
 TEST(DeviceDMatrix, Equivalent) {
  int bin_sizes[] = {2, 16, 256, 512};
  int sizes[] = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = common::GenerateRandom(num_rows, num_columns);
    for (auto num_bins : bin_sizes) {
      auto dmat = common::GetDMatrixFromData(x, num_rows, num_columns);
      auto x_device = thrust::device_vector<float>(x);
      auto adapter = common::AdapterFromData(x_device, num_rows, num_columns);
      data::DeviceDMatrix device_dmat(
          &adapter, std::numeric_limits<float>::quiet_NaN(), 1, num_bins);
      const auto &batch = *dmat->GetBatches<EllpackPage>({0, num_bins}).begin();
      const auto &device_dmat_batch =
          *device_dmat.GetBatches<EllpackPage>({0, num_bins}).begin();
      ASSERT_EQ(batch.Impl()->Cuts().Values(), device_dmat_batch.Impl()->Cuts().Values());
      ASSERT_EQ(batch.Impl()->gidx_buffer.HostVector(),
                device_dmat_batch.Impl()->gidx_buffer.HostVector());
    }
  }
 }
 TEST(DeviceDMatrix, IsDense) {
  int num_bins = 16;
  auto test = [num_bins] (float sparsity) {
    HostDeviceVector<float> data;
    std::string interface_str = RandomDataGenerator{10, 10, sparsity}
      .Device(0).GenerateArrayInterface(&data);
    data::CupyAdapter x{interface_str};
    std::unique_ptr<data::DeviceDMatrix> device_dmat{ new data::DeviceDMatrix(
        &x, std::numeric_limits<float>::quiet_NaN(), 1, num_bins) };
    if (sparsity == 0.0) {
      ASSERT_TRUE(device_dmat->IsDense()) << sparsity;
    } else {
      ASSERT_FALSE(device_dmat->IsDense());
    }
  };
  test(0.0);
  test(0.1);
 }
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -54,6 +54,7 @@ void TestTrainingPrediction(size_t rows, size_t bins,
    learner->SetParam("objective", "multi:softprob");
    learner->SetParam("num_feature", std::to_string(kCols));
    learner->SetParam("num_class", std::to_string(kClasses));
    learner->SetParam("max_bin", std::to_string(bins));
    learner->Configure();
    for (size_t i = 0; i < kIters; ++i) {
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -170,3 +170,83 @@ Arrow specification.'''
    @pytest.mark.skipif(**tm.no_cudf())
    def test_cudf_metainfo_device_dmatrix(self):
        _test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
 class IterForDMatrixTest(xgb.core.DataIter):
    '''A data iterator for XGBoost DMatrix.
    `reset` and `next` are required for any data iterator, other functions here
    are utilites for demonstration's purpose.
    '''
    ROWS_PER_BATCH = 100            # data is splited by rows
    BATCHES = 16
    def __init__(self):
        '''Generate some random data for demostration.
        Actual data can be anything that is currently supported by XGBoost.
        '''
        import cudf
        self.rows = self.ROWS_PER_BATCH
        rng = np.random.RandomState(1994)
        self._data = [
            cudf.DataFrame(
                {'a': rng.randn(self.ROWS_PER_BATCH),
                 'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
        self._labels = [rng.randn(self.rows)] * self.BATCHES
        self.it = 0             # set iterator to 0
        super().__init__()
    def as_array(self):
        import cudf
        return cudf.concat(self._data)
    def as_array_labels(self):
        return np.concatenate(self._labels)
    def data(self):
        '''Utility function for obtaining current batch of data.'''
        return self._data[self.it]
    def labels(self):
        '''Utility function for obtaining current batch of label.'''
        return self._labels[self.it]
    def reset(self):
        '''Reset the iterator'''
        self.it = 0
    def next(self, input_data):
        '''Yield next batch of data'''
        if self.it == len(self._data):
            # Return 0 when there's no more batch.
            return 0
        input_data(data=self.data(), label=self.labels())
        self.it += 1
        return 1
@pytest.mark.skipif(**tm.no_cudf())
 def test_from_cudf_iter():
    rounds = 100
    it = IterForDMatrixTest()
    # Use iterator
    m_it = xgb.DeviceQuantileDMatrix(it)
    reg_with_it = xgb.train({'tree_method': 'gpu_hist'}, m_it,
                            num_boost_round=rounds)
    predict_with_it = reg_with_it.predict(m_it)
    # Without using iterator
    m = xgb.DMatrix(it.as_array(), it.as_array_labels())
    assert m_it.num_col() == m.num_col()
    assert m_it.num_row() == m.num_row()
    reg = xgb.train({'tree_method': 'gpu_hist'}, m,
                    num_boost_round=rounds)
    predict = reg.predict(m)
    np.testing.assert_allclose(predict_with_it, predict)
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -0,0 +1,11 @@
 import os
 import subprocess
 import sys
 sys.path.append("tests/python")
 import test_demos as td         # noqa
 def test_data_iterator():
    script = os.path.join(td.PYTHON_DEMO_DIR, 'data_iterator.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)