Implement iterative DMatrix. (#5837)

This commit is contained in:
Jiaming Yuan 2020-07-03 11:44:52 +08:00 committed by GitHub
parent 4d277d750d
commit 1a0801238e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 855 additions and 84 deletions

View File

@ -26,39 +26,10 @@
// manually define unsigned long // manually define unsigned long
typedef uint64_t bst_ulong; // NOLINT(*) typedef uint64_t bst_ulong; // NOLINT(*)
/*! \brief handle to DMatrix */ /*! \brief handle to DMatrix */
typedef void *DMatrixHandle; // NOLINT(*) typedef void *DMatrixHandle; // NOLINT(*)
/*! \brief handle to Booster */ /*! \brief handle to Booster */
typedef void *BoosterHandle; // NOLINT(*) typedef void *BoosterHandle; // NOLINT(*)
/*! \brief handle to a data iterator */
typedef void *DataIterHandle; // NOLINT(*)
/*! \brief handle to a internal data holder. */
typedef void *DataHolderHandle; // NOLINT(*)
/*! \brief Mini batch used in XGBoost Data Iteration */
typedef struct { // NOLINT(*)
/*! \brief number of rows in the minibatch */
size_t size;
/* \brief number of columns in the minibatch. */
size_t columns;
/*! \brief row pointer to the rows in the data */
#ifdef __APPLE__
/* Necessary as Java on MacOS defines jlong as long int
* and gcc defines int64_t as long long int. */
long* offset; // NOLINT(*)
#else
int64_t* offset; // NOLINT(*)
#endif // __APPLE__
/*! \brief labels of each instance */
float* label;
/*! \brief weight of each instance, can be NULL */
float* weight;
/*! \brief feature index */
int* index;
/*! \brief feature values */
float* value;
} XGBoostBatchCSR;
/*! /*!
* \brief Return the version of the XGBoost library being currently used. * \brief Return the version of the XGBoost library being currently used.
@ -71,29 +42,6 @@ typedef struct { // NOLINT(*)
*/ */
XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch); XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch);
/*!
* \brief Callback to set the data to handle,
* \param handle The handle to the callback.
* \param batch The data content to be set.
*/
XGB_EXTERN_C typedef int XGBCallbackSetData( // NOLINT(*)
DataHolderHandle handle, XGBoostBatchCSR batch);
/*!
* \brief The data reading callback function.
* The iterator will be able to give subset of batch in the data.
*
* If there is data, the function will call set_function to set the data.
*
* \param data_handle The handle to the callback.
* \param set_function The batch returned by the iterator
* \param set_function_handle The handle to be passed to set function.
* \return 0 if we are reaching the end and batch is not returned.
*/
XGB_EXTERN_C typedef int XGBCallbackDataIterNext( // NOLINT(*)
DataIterHandle data_handle, XGBCallbackSetData *set_function,
DataHolderHandle set_function_handle);
/*! /*!
* \brief get string message of the last error * \brief get string message of the last error
* *
@ -126,20 +74,6 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname,
int silent, int silent,
DMatrixHandle *out); DMatrixHandle *out);
/*!
* \brief Create a DMatrix from a data iterator.
* \param data_handle The handle to the data.
* \param callback The callback to get the data.
* \param cache_info Additional information about cache file, can be null.
* \param out The created DMatrix
* \return 0 when success, -1 when failure happens.
*/
XGB_DLL int XGDMatrixCreateFromDataIter(
DataIterHandle data_handle,
XGBCallbackDataIterNext* callback,
const char* cache_info,
DMatrixHandle *out);
/*! /*!
* \brief create a matrix content from CSR format * \brief create a matrix content from CSR format
* \param indptr pointer to row headers * \param indptr pointer to row headers
@ -221,6 +155,189 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data,
bst_ulong ncol, bst_ulong ncol,
DMatrixHandle* out, DMatrixHandle* out,
int nthread); int nthread);
/*
* ========================== Begin data callback APIs =========================
*
* Short notes for data callback
*
* There are 2 sets of data callbacks for DMatrix. The first one is currently exclusively
* used by JVM packages. It uses `XGBoostBatchCSR` to accept batches for CSR formatted
* input, and concatenate them into 1 final big CSR. The related functions are:
*
* - XGBCallbackSetData
* - XGBCallbackDataIterNext
* - XGDMatrixCreateFromDataIter
*
* Another set is used by Quantile based DMatrix (used by hist algorithm) for reducing
* memory usage. Currently only a GPU implementation is available. It accepts foreign data
* iterators as callbacks and works similarly to external memory. For GPU Hist, the data is
* first compressed by quantile sketching then merged. This is particularly useful for
* distributed setting as it eliminates 2 copies of data. 1 by a `concat` from external
* library to make the data into a blob for normal DMatrix initialization, another by the
* internal CSR copy of DMatrix. Related functions are:
*
* - XGProxyDMatrixCreate
* - XGDMatrixCallbackNext
* - DataIterResetCallback
* - XGDeviceQuantileDMatrixSetDataCudaArrayInterface
* - XGDeviceQuantileDMatrixSetDataCudaColumnar
* - ... (data setters)
*/
/* ==== First set of callback functions, used exclusively by JVM packages. ==== */
/*! \brief handle to a external data iterator */
typedef void *DataIterHandle; // NOLINT(*)
/*! \brief handle to a internal data holder. */
typedef void *DataHolderHandle; // NOLINT(*)
/*! \brief Mini batch used in XGBoost Data Iteration */
typedef struct { // NOLINT(*)
/*! \brief number of rows in the minibatch */
size_t size;
/* \brief number of columns in the minibatch. */
size_t columns;
/*! \brief row pointer to the rows in the data */
#ifdef __APPLE__
/* Necessary as Java on MacOS defines jlong as long int
* and gcc defines int64_t as long long int. */
long* offset; // NOLINT(*)
#else
int64_t* offset; // NOLINT(*)
#endif // __APPLE__
/*! \brief labels of each instance */
float* label;
/*! \brief weight of each instance, can be NULL */
float* weight;
/*! \brief feature index */
int* index;
/*! \brief feature values */
float* value;
} XGBoostBatchCSR;
/*!
* \brief Callback to set the data to handle,
* \param handle The handle to the callback.
* \param batch The data content to be set.
*/
XGB_EXTERN_C typedef int XGBCallbackSetData( // NOLINT(*)
DataHolderHandle handle, XGBoostBatchCSR batch);
/*!
* \brief The data reading callback function.
* The iterator will be able to give subset of batch in the data.
*
* If there is data, the function will call set_function to set the data.
*
* \param data_handle The handle to the callback.
* \param set_function The batch returned by the iterator
* \param set_function_handle The handle to be passed to set function.
* \return 0 if we are reaching the end and batch is not returned.
*/
XGB_EXTERN_C typedef int XGBCallbackDataIterNext( // NOLINT(*)
DataIterHandle data_handle, XGBCallbackSetData *set_function,
DataHolderHandle set_function_handle);
/*!
* \brief Create a DMatrix from a data iterator.
* \param data_handle The handle to the data.
* \param callback The callback to get the data.
* \param cache_info Additional information about cache file, can be null.
* \param out The created DMatrix
* \return 0 when success, -1 when failure happens.
*/
XGB_DLL int XGDMatrixCreateFromDataIter(
DataIterHandle data_handle,
XGBCallbackDataIterNext* callback,
const char* cache_info,
DMatrixHandle *out);
/* == Second set of callback functions, used by constructing Quantile based DMatrix. ===
*
* Short note for how to use the second set of callback for GPU Hist tree method.
*
* Step 0: Define a data iterator with 2 methods `reset`, and `next`.
* Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
* Step 2: Pass the iterator handle, proxy handle and 2 methods into
* `XGDeviceQuantileDMatrixCreateFromCallback`.
* Step 3: Call appropriate data setters in `next` functions.
*
* See test_iterative_device_dmatrix.cu or Python interface for examples.
*/
/*!
* \brief Create a DMatrix proxy for setting data, can be free by XGDMatrixFree.
*
* \param out The created Device Quantile DMatrix
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out);
/*!
* \brief Callback function prototype for getting next batch of data.
*
* \param iter A handler to the user defined iterator.
*
* \return 0 when success, -1 when failure happens
*/
XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter); // NOLINT(*)
/*!
* \brief Callback function prototype for resetting external iterator
*/
XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLINT(*)
/*!
* \brief Create a device DMatrix with data iterator.
*
* \param iter A handle to external data iterator.
* \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
* \param reset Callback function resetting the iterator state.
* \param next Callback function yielding the next batch of data.
* \param missing Which value to represent missing value
* \param nthread Number of threads to use, 0 for default.
* \param max_bin Maximum number of bins for building histogram.
* \param out The created Device Quantile DMatrix
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin,
DMatrixHandle *out);
/*!
* \brief Set data on a DMatrix proxy.
*
* \param handle A DMatrix proxy created by XGProxyDMatrixCreate
* \param c_interface_str Null terminated JSON document string representation of CUDA
* array interface.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDeviceQuantileDMatrixSetDataCudaArrayInterface(
DMatrixHandle handle,
const char* c_interface_str);
/*!
* \brief Set data on a DMatrix proxy.
*
* \param handle A DMatrix proxy created by XGProxyDMatrixCreate
* \param c_interface_str Null terminated JSON document string representation of CUDA
* array interface, with an array of columns.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDeviceQuantileDMatrixSetDataCudaColumnar(
DMatrixHandle handle,
const char* c_interface_str);
/*
* ==========================- End data callback APIs ==========================
*/
/*! /*!
* \brief create a new dmatrix from sliced content of existing matrix * \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced * \param handle instance of data matrix to be sliced
@ -261,6 +378,18 @@ XGB_DLL int XGDMatrixFree(DMatrixHandle handle);
*/ */
XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle, XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle,
const char *fname, int silent); const char *fname, int silent);
/*!
* \brief Set content in array interface to a content in info.
* \param handle a instance of data matrix
* \param field field name.
* \param c_interface_str JSON string representation of array interface.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle,
char const* field,
char const* c_interface_str);
/*! /*!
* \brief set float vector to a content in info * \brief set float vector to a content in info
* \param handle a instance of data matrix * \param handle a instance of data matrix
@ -437,6 +566,10 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
int training, int training,
bst_ulong *out_len, bst_ulong *out_len,
const float **out_result); const float **out_result);
/*
* ========================== Begin Serialization APIs =========================
*/
/* /*
* Short note for serialization APIs. There are 3 different sets of serialization API. * Short note for serialization APIs. There are 3 different sets of serialization API.
* *
@ -559,6 +692,10 @@ XGB_DLL int XGBoosterSaveJsonConfig(BoosterHandle handle, bst_ulong *out_len,
*/ */
XGB_DLL int XGBoosterLoadJsonConfig(BoosterHandle handle, XGB_DLL int XGBoosterLoadJsonConfig(BoosterHandle handle,
char const *json_parameters); char const *json_parameters);
/*
* =========================== End Serialization APIs ==========================
*/
/*! /*!
* \brief dump model, return array of strings representing model dump * \brief dump model, return array of strings representing model dump

View File

@ -502,7 +502,33 @@ class DMatrix {
const std::string& cache_prefix = "", const std::string& cache_prefix = "",
size_t page_size = kPageSize); size_t page_size = kPageSize);
virtual DMatrix* Slice(common::Span<int32_t const> ridxs) = 0; /**
* \brief Create a new Quantile based DMatrix used for histogram based algorithm.
*
* \tparam DataIterHandle External iterator type, defined in C API.
* \tparam DMatrixHandle DMatrix handle, defined in C API.
* \tparam DataIterResetCallback Callback for reset, prototype defined in C API.
* \tparam XGDMatrixCallbackNext Callback for next, prototype defined in C API.
*
* \param iter External data iterator
* \param proxy A handle to ProxyDMatrix
* \param reset Callback for reset
* \param next Callback for next
* \param missing Value that should be treated as missing.
* \param nthread number of threads used for initialization.
* \param max_bin Maximum number of bins.
*
* \return A created quantile based DMatrix.
*/
template <typename DataIterHandle, typename DMatrixHandle,
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
static DMatrix *Create(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int nthread,
int max_bin);
virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;
/*! \brief page size 32 MB */ /*! \brief page size 32 MB */
static const size_t kPageSize = 32UL << 20UL; static const size_t kPageSize = 32UL << 20UL;

View File

@ -23,6 +23,7 @@
#include "../common/io.h" #include "../common/io.h"
#include "../data/adapter.h" #include "../data/adapter.h"
#include "../data/simple_dmatrix.h" #include "../data/simple_dmatrix.h"
#include "../data/proxy_dmatrix.h"
using namespace xgboost; // NOLINT(*); using namespace xgboost; // NOLINT(*);
@ -101,6 +102,50 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
#endif #endif
// Create from data iterator
/*
 * Creates a proxy DMatrix whose data is set later through the
 * XGDeviceQuantileDMatrixSetData* functions.  The handle owns a shared_ptr so
 * it can be released uniformly via XGDMatrixFree.
 */
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
  API_BEGIN();
  // Fixed: removed a stray duplicate semicolon at the end of this statement.
  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);
  API_END();
}
/*
 * Sets a CUDA array-interface (row-major array) payload on a proxy DMatrix.
 * \param handle A DMatrix proxy created by XGProxyDMatrixCreate.
 * \param c_interface_str Null-terminated JSON CUDA array interface document.
 */
XGB_DLL int
XGDeviceQuantileDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
                                                 char const *c_interface_str) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
  CHECK(p_m);
  // Use dynamic_cast (not static_cast) so the CHECK below actually fires when
  // the caller passes a non-proxy DMatrix; static_cast would never be null
  // here and the error message was unreachable.
  auto m = dynamic_cast<xgboost::data::DMatrixProxy*>(p_m->get());
  CHECK(m) << "Current DMatrix type does not support set data.";
  m->SetData(c_interface_str);
  API_END();
}
/*
 * Sets a CUDA columnar (array of per-column interfaces) payload on a proxy
 * DMatrix.
 * \param handle A DMatrix proxy created by XGProxyDMatrixCreate.
 * \param c_interface_str Null-terminated JSON document: an array of CUDA
 *        array-interface objects, one per column.
 */
XGB_DLL int
XGDeviceQuantileDMatrixSetDataCudaColumnar(DMatrixHandle handle,
                                           char const *c_interface_str) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
  CHECK(p_m);
  // dynamic_cast makes the proxy-type check below meaningful; a static_cast
  // of a non-null pointer can never fail, so the CHECK could never trigger.
  auto m = dynamic_cast<xgboost::data::DMatrixProxy*>(p_m->get());
  CHECK(m) << "Current DMatrix type does not support set data.";
  m->SetData(c_interface_str);
  API_END();
}
/*
 * Builds a Device Quantile DMatrix by repeatedly invoking the user supplied
 * reset/next callbacks through the proxy handle.  Ownership of the created
 * DMatrix is transferred to the returned shared_ptr handle.
 */
XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
    XGDMatrixCallbackNext *next, float missing, int nthread,
    int max_bin, DMatrixHandle *out) {
  API_BEGIN();
  auto *created = xgboost::DMatrix::Create(iter, proxy, reset, next, missing,
                                           nthread, max_bin);
  *out = new std::shared_ptr<xgboost::DMatrix>{created};
  API_END();
}
// End Create from data iterator
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices, const unsigned* indices,
const bst_float* data, const bst_float* data,

View File

@ -68,9 +68,11 @@ struct SketchContainer {
// Prevent copying/assigning/moving this as its internals can't be // Prevent copying/assigning/moving this as its internals can't be
// assigned/copied/moved // assigned/copied/moved
SketchContainer(const SketchContainer&) = delete; SketchContainer(const SketchContainer&) = delete;
SketchContainer(const SketchContainer&&) = delete; SketchContainer(SketchContainer&& that) {
std::swap(sketches_, that.sketches_);
}
SketchContainer& operator=(const SketchContainer&) = delete; SketchContainer& operator=(const SketchContainer&) = delete;
SketchContainer& operator=(const SketchContainer&&) = delete; SketchContainer& operator=(SketchContainer&&) = delete;
}; };
struct EntryCompareOp { struct EntryCompareOp {

View File

@ -19,6 +19,7 @@
#include "../common/version.h" #include "../common/version.h"
#include "../common/group_data.h" #include "../common/group_data.h"
#include "../data/adapter.h" #include "../data/adapter.h"
#include "../data/iterative_device_dmatrix.h"
#if DMLC_ENABLE_STD_THREAD #if DMLC_ENABLE_STD_THREAD
#include "./sparse_page_source.h" #include "./sparse_page_source.h"
@ -569,6 +570,26 @@ DMatrix* DMatrix::Load(const std::string& uri,
} }
return dmat; return dmat;
} }
// Construct a Quantile-based DMatrix from an external data iterator.  The
// template parameters mirror the callback types declared in the C API header;
// only the concrete C API instantiation below is provided.
template <typename DataIterHandle, typename DMatrixHandle,
          typename DataIterResetCallback, typename XGDMatrixCallbackNext>
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
                         DataIterResetCallback *reset,
                         XGDMatrixCallbackNext *next, float missing,
                         int nthread,
                         int max_bin) {
#if defined(XGBOOST_USE_CUDA)
  // Only the GPU implementation exists at the moment.
  return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing, nthread, max_bin);
#else
  // CPU-only build: fail loudly rather than return a half-constructed object.
  common::AssertGPUSupport();
  return nullptr;
#endif
}

// Explicit instantiation for the C API callback types, keeping the template
// definition local to this translation unit.
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
                                  DataIterResetCallback, XGDMatrixCallbackNext>(
    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
    XGDMatrixCallbackNext *next, float missing, int nthread,
    int max_bin);
template <typename AdapterT> template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,

View File

@ -252,6 +252,31 @@ EllpackPageImpl::EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense
ELLPACK_SPECIALIZATION(data::CudfAdapter) ELLPACK_SPECIALIZATION(data::CudfAdapter)
ELLPACK_SPECIALIZATION(data::CupyAdapter) ELLPACK_SPECIALIZATION(data::CupyAdapter)
// Construct an Ellpack page directly from a device adapter batch.  `nthread`
// is not used by this body — presumably kept for signature parity with the
// adapter-pointer constructor; TODO confirm.
template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                 bool is_dense, int nthread,
                                 common::Span<size_t> row_counts_span,
                                 size_t row_stride, size_t n_rows, size_t n_cols,
                                 common::HistogramCuts const& cuts) {
  dh::safe_cuda(cudaSetDevice(device));
  // Delegate allocation/layout to the basic constructor, then fill the page
  // from the adapter batch and mark the trailing slots of each row as null.
  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
  CopyDataToEllpack(batch, this, device, missing);
  WriteNullValues(this, device, row_counts_span);
}

// Explicit instantiations for the two supported device adapter batch types.
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T)                      \
  template EllpackPageImpl::EllpackPageImpl(                     \
      __BATCH_T batch, float missing, int device,                \
      bool is_dense, int nthread,                                \
      common::Span<size_t> row_counts_span,                      \
      size_t row_stride, size_t n_rows, size_t n_cols,           \
      common::HistogramCuts const& cuts);

ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
// A functor that copies the data from one EllpackPage to another. // A functor that copies the data from one EllpackPage to another.
struct CopyPage { struct CopyPage {
common::CompressedBufferWriter cbw; common::CompressedBufferWriter cbw;
@ -279,6 +304,10 @@ size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_GE(n_rows * row_stride, offset + num_elements); CHECK_GE(n_rows * row_stride, offset + num_elements);
if (page == this) {
LOG(FATAL) << "Concatenating the same Ellpack.";
return this->n_rows * this->row_stride;
}
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device); page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, num_elements, CopyPage(this, page, offset)); dh::LaunchN(device, num_elements, CopyPage(this, page, offset));

View File

@ -149,7 +149,7 @@ class EllpackPageImpl {
EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl(int device, common::HistogramCuts cuts,
const SparsePage& page, const SparsePage& page,
bool is_dense,size_t row_stride); bool is_dense, size_t row_stride);
/*! /*!
* \brief Constructor from an existing DMatrix. * \brief Constructor from an existing DMatrix.
@ -161,8 +161,16 @@ class EllpackPageImpl {
template <typename AdapterT> template <typename AdapterT>
explicit EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense, int nthread, explicit EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense, int nthread,
int max_bin, common::Span<size_t> row_counts_span, int max_bin,
common::Span<size_t> row_counts_span,
size_t row_stride); size_t row_stride);
template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, int nthread,
common::Span<size_t> row_counts_span,
size_t row_stride, size_t n_rows, size_t n_cols,
common::HistogramCuts const& cuts);
/*! \brief Copy the elements of the given ELLPACK page into this page. /*! \brief Copy the elements of the given ELLPACK page into this page.
* *
* @param device The GPU device to use. * @param device The GPU device to use.

View File

@ -0,0 +1,188 @@
/*!
* Copyright 2020 XGBoost contributors
*/
#include <memory>
#include <type_traits>
#include <algorithm>
#include "../common/hist_util.cuh"
#include "simple_batch_iterator.h"
#include "iterative_device_dmatrix.h"
#include "sparse_page_source.h"
#include "ellpack_page.cuh"
#include "proxy_dmatrix.h"
#include "device_adapter.cuh"
namespace xgboost {
namespace data {
// Invoke `fn` with the concrete adapter batch stored inside the proxy.
// Only Cupy (array interface) and Cudf (columnar) adapters are supported.
template <typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
  if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
    auto value = dmlc::get<std::shared_ptr<CupyAdapter>>(
        proxy->Adapter())->Value();
    return fn(value);
  } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
        proxy->Adapter())->Value();
    return fn(value);
  } else {
    LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
    // Unreachable after LOG(FATAL); the call below only exists so that every
    // branch returns the same deduced type.
    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
        proxy->Adapter())->Value();
    return fn(value);
  }
}
// Consume the external iterator in two passes: first sketch every batch to
// build the histogram cuts, then re-iterate to compress all batches into a
// single Ellpack page.
void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missing, int nthread) {
  // A handle passed to external iterator.
  auto handle = static_cast<std::shared_ptr<DMatrix>*>(proxy_);
  CHECK(handle);
  DMatrixProxy* proxy = static_cast<DMatrixProxy*>(handle->get());
  CHECK(proxy);
  // The external iterator
  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
    iter_handle, reset_, next_};

  dh::XGBCachingDeviceAllocator<char> alloc;

  // Helpers reading the dimensions of whatever batch is currently staged in
  // the proxy.
  auto num_rows = [&]() {
    return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
  };
  auto num_cols = [&]() {
    return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
  };
  size_t row_stride = 0;
  size_t nnz = 0;
  // Sketch for all batches.
  iter.Reset();
  common::HistogramCuts cuts;
  common::DenseCuts dense_cuts(&cuts);
  std::vector<common::SketchContainer> sketch_containers;
  size_t batches = 0;
  size_t accumulated_rows = 0;
  bst_feature_t cols = 0;
  // ---- Pass 1: sketch each batch and accumulate row/nnz statistics. ----
  while (iter.Next()) {
    auto device = proxy->DeviceIdx();
    dh::safe_cuda(cudaSetDevice(device));
    // The column count is fixed by the first batch.
    if (cols == 0) {
      cols = num_cols();
    } else {
      CHECK_EQ(cols, num_cols()) << "Inconsistent number of columns.";
    }
    // One sketch container per batch; merged column-wise after the loop.
    sketch_containers.emplace_back(batch_param_.max_bin, num_cols(), num_rows());
    auto* p_sketch = &sketch_containers.back();
    if (proxy->Info().weights_.Size() != 0) {
      proxy->Info().weights_.SetDevice(device);
      Dispatch(proxy, [&](auto const &value) {
        common::AdapterDeviceSketchWeighted(value, batch_param_.max_bin,
                                            proxy->Info(),
                                            missing, device, p_sketch);
      });
    } else {
      Dispatch(proxy, [&](auto const &value) {
        common::AdapterDeviceSketch(value, batch_param_.max_bin, missing,
                                    device, p_sketch);
      });
    }

    auto batch_rows = num_rows();
    accumulated_rows += batch_rows;
    dh::caching_device_vector<size_t> row_counts(batch_rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(),
                                         row_counts.size());
    // Track the widest row seen so the final Ellpack layout can hold any row.
    row_stride =
        std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
          return GetRowCounts(value, row_counts_span, device, missing);
        }));
    nnz += thrust::reduce(thrust::cuda::par(alloc),
                          row_counts.begin(), row_counts.end());
    batches++;
  }

  // Merging multiple batches for each column
  std::vector<common::WQSketch::SummaryContainer> summary_array(cols);
  size_t intermediate_num_cuts = std::min(
      accumulated_rows, static_cast<size_t>(batch_param_.max_bin *
                                            common::SketchContainer::kFactor));
  size_t nbytes =
      common::WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
  // Columns are independent, so the merge is parallelized over columns.
#pragma omp parallel for num_threads(nthread) if (nthread > 0)
  for (omp_ulong c = 0; c < cols; ++c) {
    for (auto& sketch_batch : sketch_containers) {
      common::WQSketch::SummaryContainer summary;
      sketch_batch.sketches_.at(c).GetSummary(&summary);
      // Release the per-batch sketch memory as soon as it is summarized.
      sketch_batch.sketches_.at(c).Init(0, 1);
      summary_array.at(c).Reduce(summary, nbytes);
    }
  }
  sketch_containers.clear();

  // Build the final summary.
  std::vector<common::WQSketch> sketches(cols);
#pragma omp parallel for num_threads(nthread) if (nthread > 0)
  for (omp_ulong c = 0; c < cols; ++c) {
    sketches.at(c).Init(
        accumulated_rows,
        1.0 / (common::SketchContainer::kFactor * batch_param_.max_bin));
    sketches.at(c).PushSummary(summary_array.at(c));
  }
  dense_cuts.Init(&sketches, batch_param_.max_bin, accumulated_rows);
  summary_array.clear();

  // Shape info must be set before IsDense() is consulted below.
  this->info_.num_col_ = cols;
  this->info_.num_row_ = accumulated_rows;
  this->info_.num_nonzero_ = nnz;

  // ---- Pass 2: construct the final ellpack page. ----
  page_.reset(new EllpackPage);
  *(page_->Impl()) = EllpackPageImpl(proxy->DeviceIdx(), cuts, this->IsDense(),
                                     row_stride, accumulated_rows);
  size_t offset = 0;
  iter.Reset();
  while (iter.Next()) {
    auto device = proxy->DeviceIdx();
    dh::safe_cuda(cudaSetDevice(device));
    auto rows = num_rows();
    dh::caching_device_vector<size_t> row_counts(rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(),
                                         row_counts.size());
    Dispatch(proxy, [=](auto const& value) {
      return GetRowCounts(value, row_counts_span, device, missing);
    });
    auto is_dense = this->IsDense();
    // Compress this batch with the shared cuts, then append it to the page.
    auto new_impl = Dispatch(proxy, [&](auto const &value) {
      return EllpackPageImpl(value, missing, device, is_dense, nthread,
                             row_counts_span, row_stride, rows, cols, cuts);
    });
    size_t num_elements = page_->Impl()->Copy(device, &new_impl, offset);
    offset += num_elements;

    // Accumulate per-batch meta info (labels, weights, ...) when there is
    // more than one batch; the single-batch case is moved wholesale below.
    proxy->Info().num_row_ = num_rows();
    proxy->Info().num_col_ = cols;
    if (batches != 1) {
      this->info_.Extend(std::move(proxy->Info()), false);
    }
  }

  if (batches == 1) {
    this->info_ = std::move(proxy->Info());
    CHECK_EQ(proxy->Info().labels_.Size(), 0);
  }

  iter.Reset();
  // Synchronise worker columns
  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
}
// The entire dataset was compressed into a single Ellpack page at
// construction time, so this always yields exactly one batch.  `param` is
// unused: the bins were fixed when the page was built.
BatchSet<EllpackPage> IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) {
  CHECK(page_);
  auto* impl = new SimpleBatchIteratorImpl<EllpackPage>(page_.get());
  auto begin_iter = BatchIterator<EllpackPage>(impl);
  return BatchSet<EllpackPage>(begin_iter);
}
} // namespace data
} // namespace xgboost

View File

@ -0,0 +1,76 @@
/*!
* Copyright 2020 by Contributors
* \file iterative_device_dmatrix.h
*/
#ifndef XGBOOST_DATA_ITERATIVE_DEVICE_DMATRIX_H_
#define XGBOOST_DATA_ITERATIVE_DEVICE_DMATRIX_H_
#include <vector>
#include <string>
#include <utility>
#include <memory>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/c_api.h"
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
/**
 * \brief DMatrix backed by an external data iterator.  All batches are
 *        quantile-sketched and compressed into a single in-memory Ellpack
 *        page during construction; only Ellpack batches can be read back.
 */
class IterativeDeviceDMatrix : public DMatrix {
  MetaInfo info_;
  BatchParam batch_param_;
  std::shared_ptr<EllpackPage> page_;

  // Handle and callbacks supplied through the C API; not owned here.
  DMatrixHandle proxy_;
  DataIterResetCallback *reset_;
  XGDMatrixCallbackNext *next_;

 public:
  // Consumes the external iterator and builds the compressed Ellpack page.
  // (Removed a redundant duplicate `public:` access specifier here.)
  void Initialize(DataIterHandle iter, float missing, int nthread);

  explicit IterativeDeviceDMatrix(DataIterHandle iter, DMatrixHandle proxy,
                                  DataIterResetCallback *reset,
                                  XGDMatrixCallbackNext *next, float missing,
                                  int nthread, int max_bin)
      : proxy_{proxy}, reset_{reset}, next_{next} {
    batch_param_ = BatchParam{0, max_bin, 0};
    this->Initialize(iter, missing, nthread);
  }

  bool EllpackExists() const override { return true; }
  bool SparsePageExists() const override { return false; }

  DMatrix *Slice(common::Span<int32_t const> ridxs) override {
    LOG(FATAL) << "Slicing DMatrix is not supported for Device DMatrix.";
    return nullptr;
  }

  // Row/column views are not materialized for this DMatrix type.
  BatchSet<SparsePage> GetRowBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
  }
  BatchSet<CSCPage> GetColumnBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<CSCPage>(BatchIterator<CSCPage>(nullptr));
  }
  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(nullptr));
  }
  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;

  bool SingleColBlock() const override { return false; }

  MetaInfo& Info() override {
    return info_;
  }
  MetaInfo const& Info() const override {
    return info_;
  }
};
} // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_ITERATIVE_DEVICE_DMATRIX_H_

View File

@ -0,0 +1,166 @@
/*!
* Copyright 2020 XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/data/iterative_device_dmatrix.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/device_adapter.cuh"
namespace xgboost {
namespace data {
// Checks that a DMatrix built through the iterator interface yields the same
// cuts, min values, cut pointers and compressed gidx buffer as a DMatrix
// built from the equivalent single array.
void TestEquivalent(float sparsity) {
  CudaArrayIterForTest iter{sparsity};
  IterativeDeviceDMatrix m(
      &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
      0, 256);
  size_t offset = 0;
  auto first = (*m.GetEllpackBatches({}).begin()).Impl();
  std::unique_ptr<EllpackPageImpl> page_concatenated {
    new EllpackPageImpl(0, first->Cuts(), first->is_dense,
                        first->row_stride, 1000 * 100)};
  // Concatenate every batch produced by the iterator into one page.
  for (auto& batch : m.GetBatches<EllpackPage>()) {
    auto page = batch.Impl();
    size_t num_elements = page_concatenated->Copy(0, page, offset);
    offset += num_elements;
  }
  auto from_iter = page_concatenated->GetDeviceAccessor(0);
  ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::kCols);
  ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::kRows);

  // Reference: the same data loaded in one shot through the Cupy adapter.
  std::string interface_str = iter.AsArray();
  auto adapter = CupyAdapter(interface_str);
  std::unique_ptr<DMatrix> dm{
    DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
  BatchParam bp {0, 256};
  for (auto& ellpack : dm->GetBatches<EllpackPage>(bp)) {
    auto from_data = ellpack.Impl()->GetDeviceAccessor(0);

    std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
    std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size());
    std::vector<uint32_t> cut_ptrs_iter(from_iter.feature_segments.size());
    dh::CopyDeviceSpanToVector(&cuts_from_iter, from_iter.gidx_fvalue_map);
    dh::CopyDeviceSpanToVector(&min_fvalues_iter, from_iter.min_fvalue);
    dh::CopyDeviceSpanToVector(&cut_ptrs_iter, from_iter.feature_segments);

    std::vector<float> cuts_from_data(from_data.gidx_fvalue_map.size());
    std::vector<float> min_fvalues_data(from_data.min_fvalue.size());
    std::vector<uint32_t> cut_ptrs_data(from_data.feature_segments.size());
    dh::CopyDeviceSpanToVector(&cuts_from_data, from_data.gidx_fvalue_map);
    dh::CopyDeviceSpanToVector(&min_fvalues_data, from_data.min_fvalue);
    dh::CopyDeviceSpanToVector(&cut_ptrs_data, from_data.feature_segments);

    ASSERT_EQ(cuts_from_iter.size(), cuts_from_data.size());
    for (size_t i = 0; i < cuts_from_iter.size(); ++i) {
      EXPECT_NEAR(cuts_from_iter[i], cuts_from_data[i], kRtEps);
    }
    ASSERT_EQ(min_fvalues_iter.size(), min_fvalues_data.size());
    for (size_t i = 0; i < min_fvalues_iter.size(); ++i) {
      ASSERT_NEAR(min_fvalues_iter[i], min_fvalues_data[i], kRtEps);
    }
    ASSERT_EQ(cut_ptrs_iter.size(), cut_ptrs_data.size());
    for (size_t i = 0; i < cut_ptrs_iter.size(); ++i) {
      ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]);
    }

    auto const& buffer_from_iter = page_concatenated->gidx_buffer;
    auto const& buffer_from_data = ellpack.Impl()->gidx_buffer;
    ASSERT_NE(buffer_from_data.Size(), 0);
    // Fixed: the original compared buffer_from_data against itself, which is
    // trivially true; compare the iterator-built buffer against the
    // data-built buffer instead.
    ASSERT_EQ(buffer_from_iter.ConstHostVector(),
              buffer_from_data.ConstHostVector());
  }
}
// The iteratively-built DMatrix must be equivalent to the adapter-built one
// for both fully dense and partially sparse inputs.
TEST(IterativeDeviceDMatrix, Basic) {
  for (float sparsity : {0.0f, 0.5f}) {
    TestEquivalent(sparsity);
  }
}
// Dense row-major CUDA array input: every stored gradient index must match the
// bin found by searching the quantile cuts for the original feature value, and
// all source batches must be concatenated into a single EllpackPage.
TEST(IterativeDeviceDMatrix, RowMajor) {
  CudaArrayIterForTest iter(0.0f);  // sparsity 0 => fully dense data
  IterativeDeviceDMatrix m(
      &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
      0, 256);
  size_t n_batches = 0;
  std::string interface_str = iter.AsArray();
  for (auto& ellpack : m.GetBatches<EllpackPage>()) {
    n_batches ++;
    auto impl = ellpack.Impl();
    // Decode the compressed gradient-index buffer on the host.
    common::CompressedIterator<uint32_t> iterator(
        impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
    auto cols = CudaArrayIterForTest::kCols;
    auto rows = CudaArrayIterForTest::kRows;
    // Pull the raw feature values back from the device through the
    // array-interface JSON produced by the iterator (presumably describing the
    // full generated data -- see CudaArrayIterForTest::AsArray).
    auto j_interface =
        Json::Load({interface_str.c_str(), interface_str.size()});
    ArrayInterface loaded {get<Object const>(j_interface)};
    std::vector<float> h_data(cols * rows);
    common::Span<float> s_data{static_cast<float*>(loaded.data), cols * rows};
    dh::CopyDeviceSpanToVector(&h_data, s_data);
    for(auto i = 0ull; i < rows * cols; i++) {
      // Row-major layout: column index cycles with period `cols`.
      int column_idx = i % cols;
      EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]);
    }
    EXPECT_EQ(m.Info().num_col_, cols);
    EXPECT_EQ(m.Info().num_row_, rows);
    // Dense input: every cell is a valid (non-missing) entry.
    EXPECT_EQ(m.Info().num_nonzero_, rows * cols);
  }
  // All batches are concatenated.
  ASSERT_EQ(n_batches, 1);
}
// Input containing NaNs: missing entries must be excluded from num_nonzero_
// and represented as the null bin value in the ELLPACK page.
TEST(IterativeDeviceDMatrix, RowMajorMissing) {
  const float kMissing = std::numeric_limits<float>::quiet_NaN();
  size_t rows = 10;
  size_t cols = 2;
  CudaArrayIterForTest iter(0.0f, rows, cols, 2);  // 2 batches
  std::string interface_str = iter.AsArray();
  auto j_interface =
      Json::Load({interface_str.c_str(), interface_str.size()});
  ArrayInterface loaded {get<Object const>(j_interface)};
  std::vector<float> h_data(cols * rows);
  common::Span<float> s_data{static_cast<float*>(loaded.data), cols * rows};
  dh::CopyDeviceSpanToVector(&h_data, s_data);
  // Punch three missing values into the generated data ...
  h_data[1] = kMissing;
  h_data[5] = kMissing;
  h_data[6] = kMissing;
  // ... and write the modified buffer back to the device allocation that the
  // array-interface JSON points at.
  auto ptr = thrust::device_ptr<float>(
      reinterpret_cast<float *>(get<Integer>(j_interface["data"][0])));
  thrust::copy(h_data.cbegin(), h_data.cend(), ptr);
  IterativeDeviceDMatrix m(
      &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
      0, 256);
  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto impl = ellpack.Impl();
  // Decode the compressed gradient-index buffer on the host.
  common::CompressedIterator<uint32_t> iterator(
      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue());
  // null values get placed after valid values in a row, so the NaN written at
  // h_data[6] (first slot of row 3) shows up at ELLPACK position 7.
  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(m.Info().num_col_, cols);
  EXPECT_EQ(m.Info().num_row_, rows);
  // Exactly the three entries overwritten with NaN above are missing.
  EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
}
// IsDense() must reflect the sparsity of the source data: a fully dense input
// yields a dense DMatrix, any amount of missing values makes it sparse.
TEST(IterativeDeviceDMatrix, IsDense) {
  int num_bins = 16;
  auto test = [num_bins] (float sparsity) {
    CudaArrayIterForTest iter(sparsity);
    // Fix: the original hard-coded 256 bins while declaring and capturing
    // num_bins without using it; pass num_bins as clearly intended.
    // Density does not depend on the bin count, so the assertions are
    // unaffected.
    IterativeDeviceDMatrix m(
        &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
        0, num_bins);
    if (sparsity == 0.0) {
      ASSERT_TRUE(m.IsDense());
    } else {
      ASSERT_FALSE(m.IsDense());
    }
  };
  test(0.0);
  test(0.1);
}
} // namespace data
} // namespace xgboost

View File

@ -1,17 +1,43 @@
#include <xgboost/c_api.h>
#include "helpers.h" #include "helpers.h"
#include "../../src/data/device_adapter.cuh" #include "../../src/data/device_adapter.cuh"
#include "../../src/data/device_dmatrix.h" #include "../../src/data/iterative_device_dmatrix.h"
namespace xgboost { namespace xgboost {
CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
size_t cols, size_t batches)
: rows_{rows}, cols_{cols}, n_batches_{batches} {
XGProxyDMatrixCreate(&proxy_);
rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
rng_->Device(0);
std::tie(batches_, interface_) =
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();
}
CudaArrayIterForTest::~CudaArrayIterForTest() { XGDMatrixFree(proxy_); }
int CudaArrayIterForTest::Next() {
if (iter_ == n_batches_) {
return 0;
}
XGDeviceQuantileDMatrixSetDataCudaArrayInterface(proxy_, batches_[iter_].c_str());
iter_++;
return 1;
}
size_t constexpr CudaArrayIterForTest::kRows;
size_t constexpr CudaArrayIterForTest::kCols;
std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label, std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label,
bool float_label, bool float_label,
size_t classes) { size_t classes) {
std::vector<HostDeviceVector<float>> storage(cols_); CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
std::string arr = this->GenerateColumnarArrayInterface(&storage); auto m = std::make_shared<data::IterativeDeviceDMatrix>(
auto adapter = data::CudfAdapter(arr); &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
std::shared_ptr<DMatrix> m { 0, bins_);
new data::DeviceDMatrix{&adapter,
std::numeric_limits<float>::quiet_NaN(), 1, 256}};
return m; return m;
} }
} // namespace xgboost } // namespace xgboost

View File

@ -304,5 +304,51 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row
HostDeviceVector<GradientPair> gpair(h_gpair); HostDeviceVector<GradientPair> gpair(h_gpair);
return gpair; return gpair;
} }
typedef void *DMatrixHandle; // NOLINT(*);
class CudaArrayIterForTest {
HostDeviceVector<float> data_;
size_t iter_ {0};
DMatrixHandle proxy_;
std::unique_ptr<RandomDataGenerator> rng_;
std::vector<std::string> batches_;
std::string interface_;
size_t rows_;
size_t cols_;
size_t n_batches_;
public:
size_t static constexpr kRows { 1000 };
size_t static constexpr kBatches { 100 };
size_t static constexpr kCols { 13 };
explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
size_t cols = kCols, size_t batches = kBatches);
~CudaArrayIterForTest();
std::string AsArray() const {
return interface_;
}
int Next();
void Reset() {
iter_ = 0;
}
size_t Iter() const { return iter_; }
auto Proxy() -> decltype(proxy_) { return proxy_; }
};
typedef void *DataIterHandle; // NOLINT(*)
inline void Reset(DataIterHandle self) {
static_cast<CudaArrayIterForTest*>(self)->Reset();
}
inline int Next(DataIterHandle self) {
return static_cast<CudaArrayIterForTest*>(self)->Next();
}
} // namespace xgboost } // namespace xgboost
#endif #endif

View File

@ -76,15 +76,15 @@ TEST(GPUPredictor, EllpackTraining) {
.Bins(kBins) .Bins(kBins)
.Device(0) .Device(0)
.GenerateDeviceDMatrix(true); .GenerateDeviceDMatrix(true);
std::vector<HostDeviceVector<float>> storage(kCols); HostDeviceVector<float> storage(kRows * kCols);
auto columnar = RandomDataGenerator{kRows, kCols, 0.0} auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
.Device(0) .Device(0)
.GenerateColumnarArrayInterface(&storage); .GenerateArrayInterface(&storage);
auto adapter = data::CudfAdapter(columnar); auto adapter = data::CupyAdapter(columnar);
std::shared_ptr<DMatrix> p_full { std::shared_ptr<DMatrix> p_full {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1) DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
}; };
TestTrainingPrediction(kRows, "gpu_hist", p_full, p_ellpack); TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack);
} }
TEST(GPUPredictor, ExternalMemoryTest) { TEST(GPUPredictor, ExternalMemoryTest) {

View File

@ -32,7 +32,8 @@ TEST(Predictor, PredictionCache) {
EXPECT_ANY_THROW(container.Entry(m)); EXPECT_ANY_THROW(container.Entry(m));
} }
void TestTrainingPrediction(size_t rows, std::string tree_method, void TestTrainingPrediction(size_t rows, size_t bins,
std::string tree_method,
std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_full,
std::shared_ptr<DMatrix> p_hist) { std::shared_ptr<DMatrix> p_hist) {
size_t constexpr kCols = 16; size_t constexpr kCols = 16;

View File

@ -52,7 +52,7 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
} }
// p_full and p_hist should come from the same data set. // p_full and p_hist should come from the same data set.
void TestTrainingPrediction(size_t rows, std::string tree_method, void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_full,
std::shared_ptr<DMatrix> p_hist); std::shared_ptr<DMatrix> p_hist);