Implement iterative DMatrix. (#5837)

This commit is contained in:
Jiaming Yuan 2020-07-03 11:44:52 +08:00 committed by GitHub
parent 4d277d750d
commit 1a0801238e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 855 additions and 84 deletions

View File

@ -26,39 +26,10 @@
// manually define unsigned long // manually define unsigned long
typedef uint64_t bst_ulong; // NOLINT(*) typedef uint64_t bst_ulong; // NOLINT(*)
/*! \brief handle to DMatrix */ /*! \brief handle to DMatrix */
typedef void *DMatrixHandle; // NOLINT(*) typedef void *DMatrixHandle; // NOLINT(*)
/*! \brief handle to Booster */ /*! \brief handle to Booster */
typedef void *BoosterHandle; // NOLINT(*) typedef void *BoosterHandle; // NOLINT(*)
/*! \brief handle to a data iterator */
typedef void *DataIterHandle; // NOLINT(*)
/*! \brief handle to a internal data holder. */
typedef void *DataHolderHandle; // NOLINT(*)
/*! \brief Mini batch used in XGBoost Data Iteration */
typedef struct { // NOLINT(*)
/*! \brief number of rows in the minibatch */
size_t size;
/* \brief number of columns in the minibatch. */
size_t columns;
/*! \brief row pointer to the rows in the data */
#ifdef __APPLE__
/* Necessary as Java on MacOS defines jlong as long int
* and gcc defines int64_t as long long int. */
long* offset; // NOLINT(*)
#else
int64_t* offset; // NOLINT(*)
#endif // __APPLE__
/*! \brief labels of each instance */
float* label;
/*! \brief weight of each instance, can be NULL */
float* weight;
/*! \brief feature index */
int* index;
/*! \brief feature values */
float* value;
} XGBoostBatchCSR;
/*! /*!
* \brief Return the version of the XGBoost library being currently used. * \brief Return the version of the XGBoost library being currently used.
@ -71,29 +42,6 @@ typedef struct { // NOLINT(*)
*/ */
XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch); XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch);
/*!
* \brief Callback to set the data to handle,
* \param handle The handle to the callback.
* \param batch The data content to be set.
*/
XGB_EXTERN_C typedef int XGBCallbackSetData( // NOLINT(*)
DataHolderHandle handle, XGBoostBatchCSR batch);
/*!
* \brief The data reading callback function.
* The iterator will be able to give subset of batch in the data.
*
* If there is data, the function will call set_function to set the data.
*
* \param data_handle The handle to the callback.
* \param set_function The batch returned by the iterator
* \param set_function_handle The handle to be passed to set function.
* \return 0 if we are reaching the end and batch is not returned.
*/
XGB_EXTERN_C typedef int XGBCallbackDataIterNext( // NOLINT(*)
DataIterHandle data_handle, XGBCallbackSetData *set_function,
DataHolderHandle set_function_handle);
/*! /*!
* \brief get string message of the last error * \brief get string message of the last error
* *
@ -126,20 +74,6 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname,
int silent, int silent,
DMatrixHandle *out); DMatrixHandle *out);
/*!
* \brief Create a DMatrix from a data iterator.
* \param data_handle The handle to the data.
* \param callback The callback to get the data.
* \param cache_info Additional information about cache file, can be null.
* \param out The created DMatrix
* \return 0 when success, -1 when failure happens.
*/
XGB_DLL int XGDMatrixCreateFromDataIter(
DataIterHandle data_handle,
XGBCallbackDataIterNext* callback,
const char* cache_info,
DMatrixHandle *out);
/*! /*!
* \brief create a matrix content from CSR format * \brief create a matrix content from CSR format
* \param indptr pointer to row headers * \param indptr pointer to row headers
@ -221,6 +155,189 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data,
bst_ulong ncol, bst_ulong ncol,
DMatrixHandle* out, DMatrixHandle* out,
int nthread); int nthread);
/*
* ========================== Begin data callback APIs =========================
*
* Short notes for data callback
*
* There are 2 sets of data callbacks for DMatrix. The first one is currently exclusively
* used by JVM packages. It uses `XGBoostBatchCSR` to accept batches for CSR formatted
* input, and concatenate them into 1 final big CSR. The related functions are:
*
* - XGBCallbackSetData
* - XGBCallbackDataIterNext
* - XGDMatrixCreateFromDataIter
*
* Another set is used by Quantile based DMatrix (used by hist algorithm) for reducing
* memory usage. Currently only a GPU implementation is available. It accepts foreign data
* iterators as callbacks and works similarly to external memory. For GPU Hist, the data is
* first compressed by quantile sketching then merged. This is particularly useful for
* distributed setting as it eliminates 2 copies of data. 1 by a `concat` from external
* library to make the data into a blob for normal DMatrix initialization, another by the
* internal CSR copy of DMatrix. Related functions are:
*
* - XGProxyDMatrixCreate
* - XGDMatrixCallbackNext
* - DataIterResetCallback
* - XGDeviceQuantileDMatrixSetDataCudaArrayInterface
* - XGDeviceQuantileDMatrixSetDataCudaColumnar
* - ... (data setters)
*/
/* ==== First set of callback functions, used exclusively by JVM packages. ==== */
/*! \brief handle to a external data iterator */
typedef void *DataIterHandle; // NOLINT(*)
/*! \brief handle to a internal data holder. */
typedef void *DataHolderHandle; // NOLINT(*)
/*! \brief Mini batch used in XGBoost Data Iteration */
typedef struct { // NOLINT(*)
/*! \brief number of rows in the minibatch */
size_t size;
/* \brief number of columns in the minibatch. */
size_t columns;
/*! \brief row pointer to the rows in the data */
#ifdef __APPLE__
/* Necessary as Java on MacOS defines jlong as long int
* and gcc defines int64_t as long long int. */
long* offset; // NOLINT(*)
#else
int64_t* offset; // NOLINT(*)
#endif // __APPLE__
/*! \brief labels of each instance */
float* label;
/*! \brief weight of each instance, can be NULL */
float* weight;
/*! \brief feature index */
int* index;
/*! \brief feature values */
float* value;
} XGBoostBatchCSR;
/*!
* \brief Callback to set the data to handle,
* \param handle The handle to the callback.
* \param batch The data content to be set.
*/
XGB_EXTERN_C typedef int XGBCallbackSetData( // NOLINT(*)
DataHolderHandle handle, XGBoostBatchCSR batch);
/*!
* \brief The data reading callback function.
* The iterator will be able to give subset of batch in the data.
*
* If there is data, the function will call set_function to set the data.
*
* \param data_handle The handle to the callback.
* \param set_function The batch returned by the iterator
* \param set_function_handle The handle to be passed to set function.
* \return 0 if we are reaching the end and batch is not returned.
*/
XGB_EXTERN_C typedef int XGBCallbackDataIterNext( // NOLINT(*)
DataIterHandle data_handle, XGBCallbackSetData *set_function,
DataHolderHandle set_function_handle);
/*!
* \brief Create a DMatrix from a data iterator.
* \param data_handle The handle to the data.
* \param callback The callback to get the data.
* \param cache_info Additional information about cache file, can be null.
* \param out The created DMatrix
* \return 0 when success, -1 when failure happens.
*/
XGB_DLL int XGDMatrixCreateFromDataIter(
DataIterHandle data_handle,
XGBCallbackDataIterNext* callback,
const char* cache_info,
DMatrixHandle *out);
/* == Second set of callback functions, used by constructing Quantile based DMatrix. ===
*
* Short note for how to use the second set of callback for GPU Hist tree method.
*
* Step 0: Define a data iterator with 2 methods `reset`, and `next`.
* Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
* Step 2: Pass the iterator handle, proxy handle and 2 methods into
* `XGDeviceQuantileDMatrixCreateFromCallback`.
* Step 3: Call appropriate data setters in `next` functions.
*
* See test_iterative_device_dmatrix.cu or Python interface for examples.
*/
/*!
* \brief Create a DMatrix proxy for setting data, can be free by XGDMatrixFree.
*
* \param out The created Device Quantile DMatrix
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out);
/*!
* \brief Callback function prototype for getting next batch of data.
*
* \param iter A handler to the user defined iterator.
*
* \return 0 when success, -1 when failure happens
*/
XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter); // NOLINT(*)
/*!
* \brief Callback function prototype for resetting external iterator
*/
XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLINT(*)
/*!
* \brief Create a device DMatrix with data iterator.
*
* \param iter A handle to external data iterator.
* \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
* \param reset Callback function resetting the iterator state.
* \param next Callback function yielding the next batch of data.
* \param missing Which value to represent missing value
* \param nthread Number of threads to use, 0 for default.
* \param max_bin Maximum number of bins for building histogram.
* \param out The created Device Quantile DMatrix
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin,
DMatrixHandle *out);
/*!
* \brief Set data on a DMatrix proxy.
*
* \param handle A DMatrix proxy created by XGProxyDMatrixCreate
* \param c_interface_str Null terminated JSON document string representation of CUDA
* array interface.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDeviceQuantileDMatrixSetDataCudaArrayInterface(
DMatrixHandle handle,
const char* c_interface_str);
/*!
* \brief Set data on a DMatrix proxy.
*
* \param handle A DMatrix proxy created by XGProxyDMatrixCreate
* \param c_interface_str Null terminated JSON document string representation of CUDA
* array interface, with an array of columns.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDeviceQuantileDMatrixSetDataCudaColumnar(
DMatrixHandle handle,
const char* c_interface_str);
/*
* ==========================- End data callback APIs ==========================
*/
/*! /*!
* \brief create a new dmatrix from sliced content of existing matrix * \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced * \param handle instance of data matrix to be sliced
@ -261,6 +378,18 @@ XGB_DLL int XGDMatrixFree(DMatrixHandle handle);
*/ */
XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle, XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle,
const char *fname, int silent); const char *fname, int silent);
/*!
* \brief Set content in array interface to a content in info.
* \param handle a instance of data matrix
* \param field field name.
* \param c_interface_str JSON string representation of array interface.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle,
char const* field,
char const* c_interface_str);
/*! /*!
* \brief set float vector to a content in info * \brief set float vector to a content in info
* \param handle a instance of data matrix * \param handle a instance of data matrix
@ -437,6 +566,10 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
int training, int training,
bst_ulong *out_len, bst_ulong *out_len,
const float **out_result); const float **out_result);
/*
* ========================== Begin Serialization APIs =========================
*/
/* /*
* Short note for serialization APIs. There are 3 different sets of serialization API. * Short note for serialization APIs. There are 3 different sets of serialization API.
* *
@ -559,6 +692,10 @@ XGB_DLL int XGBoosterSaveJsonConfig(BoosterHandle handle, bst_ulong *out_len,
*/ */
XGB_DLL int XGBoosterLoadJsonConfig(BoosterHandle handle, XGB_DLL int XGBoosterLoadJsonConfig(BoosterHandle handle,
char const *json_parameters); char const *json_parameters);
/*
* =========================== End Serialization APIs ==========================
*/
/*! /*!
* \brief dump model, return array of strings representing model dump * \brief dump model, return array of strings representing model dump

View File

@ -502,7 +502,33 @@ class DMatrix {
const std::string& cache_prefix = "", const std::string& cache_prefix = "",
size_t page_size = kPageSize); size_t page_size = kPageSize);
virtual DMatrix* Slice(common::Span<int32_t const> ridxs) = 0; /**
* \brief Create a new Quantile based DMatrix used for histogram based algorithm.
*
* \tparam DataIterHandle External iterator type, defined in C API.
* \tparam DMatrixHandle DMatrix handle, defined in C API.
* \tparam DataIterResetCallback Callback for reset, prototype defined in C API.
* \tparam XGDMatrixCallbackNext Callback for next, prototype defined in C API.
*
* \param iter External data iterator
* \param proxy A handle to ProxyDMatrix
* \param reset Callback for reset
* \param next Callback for next
* \param missing Value that should be treated as missing.
* \param nthread number of threads used for initialization.
* \param max_bin Maximum number of bins.
*
* \return A created quantile based DMatrix.
*/
template <typename DataIterHandle, typename DMatrixHandle,
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
static DMatrix *Create(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int nthread,
int max_bin);
virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;
/*! \brief page size 32 MB */ /*! \brief page size 32 MB */
static const size_t kPageSize = 32UL << 20UL; static const size_t kPageSize = 32UL << 20UL;

View File

@ -23,6 +23,7 @@
#include "../common/io.h" #include "../common/io.h"
#include "../data/adapter.h" #include "../data/adapter.h"
#include "../data/simple_dmatrix.h" #include "../data/simple_dmatrix.h"
#include "../data/proxy_dmatrix.h"
using namespace xgboost; // NOLINT(*); using namespace xgboost; // NOLINT(*);
@ -101,6 +102,50 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
#endif #endif
// Create from data iterator
/*
 * Creates a proxy DMatrix whose data is set later through the
 * XGDeviceQuantileDMatrixSetData* functions.  The handle owns a shared_ptr so
 * it can be released uniformly via XGDMatrixFree.
 */
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
  API_BEGIN();
  // Fixed: removed a stray duplicate semicolon at the end of this statement.
  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);
  API_END();
}
/*
 * Sets a CUDA array-interface (row-major array) payload on a proxy DMatrix.
 * \param handle A DMatrix proxy created by XGProxyDMatrixCreate.
 * \param c_interface_str Null-terminated JSON CUDA array interface document.
 */
XGB_DLL int
XGDeviceQuantileDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
                                                 char const *c_interface_str) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
  CHECK(p_m);
  // Use dynamic_cast (not static_cast) so the CHECK below actually fires when
  // the caller passes a non-proxy DMatrix; static_cast would never be null
  // here and the error message was unreachable.
  auto m = dynamic_cast<xgboost::data::DMatrixProxy*>(p_m->get());
  CHECK(m) << "Current DMatrix type does not support set data.";
  m->SetData(c_interface_str);
  API_END();
}
/*
 * Sets a CUDA columnar (array of per-column interfaces) payload on a proxy
 * DMatrix.
 * \param handle A DMatrix proxy created by XGProxyDMatrixCreate.
 * \param c_interface_str Null-terminated JSON document: an array of CUDA
 *        array-interface objects, one per column.
 */
XGB_DLL int
XGDeviceQuantileDMatrixSetDataCudaColumnar(DMatrixHandle handle,
                                           char const *c_interface_str) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
  CHECK(p_m);
  // dynamic_cast makes the proxy-type check below meaningful; a static_cast
  // of a non-null pointer can never fail, so the CHECK could never trigger.
  auto m = dynamic_cast<xgboost::data::DMatrixProxy*>(p_m->get());
  CHECK(m) << "Current DMatrix type does not support set data.";
  m->SetData(c_interface_str);
  API_END();
}
/*
 * Builds a Device Quantile DMatrix by repeatedly invoking the user supplied
 * reset/next callbacks through the proxy handle.  Ownership of the created
 * DMatrix is transferred to the returned shared_ptr handle.
 */
XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
    XGDMatrixCallbackNext *next, float missing, int nthread,
    int max_bin, DMatrixHandle *out) {
  API_BEGIN();
  auto *created = xgboost::DMatrix::Create(iter, proxy, reset, next, missing,
                                           nthread, max_bin);
  *out = new std::shared_ptr<xgboost::DMatrix>{created};
  API_END();
}
// End Create from data iterator
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices, const unsigned* indices,
const bst_float* data, const bst_float* data,

View File

@ -68,9 +68,11 @@ struct SketchContainer {
// Prevent copying/assigning/moving this as its internals can't be // Prevent copying/assigning/moving this as its internals can't be
// assigned/copied/moved // assigned/copied/moved
SketchContainer(const SketchContainer&) = delete; SketchContainer(const SketchContainer&) = delete;
SketchContainer(const SketchContainer&&) = delete; SketchContainer(SketchContainer&& that) {
std::swap(sketches_, that.sketches_);
}
SketchContainer& operator=(const SketchContainer&) = delete; SketchContainer& operator=(const SketchContainer&) = delete;
SketchContainer& operator=(const SketchContainer&&) = delete; SketchContainer& operator=(SketchContainer&&) = delete;
}; };
struct EntryCompareOp { struct EntryCompareOp {

View File

@ -19,6 +19,7 @@
#include "../common/version.h" #include "../common/version.h"
#include "../common/group_data.h" #include "../common/group_data.h"
#include "../data/adapter.h" #include "../data/adapter.h"
#include "../data/iterative_device_dmatrix.h"
#if DMLC_ENABLE_STD_THREAD #if DMLC_ENABLE_STD_THREAD
#include "./sparse_page_source.h" #include "./sparse_page_source.h"
@ -569,6 +570,26 @@ DMatrix* DMatrix::Load(const std::string& uri,
} }
return dmat; return dmat;
} }
// Construct a Quantile-based DMatrix from an external data iterator.  The
// template parameters mirror the callback types declared in the C API header;
// only the concrete C API instantiation below is provided.
template <typename DataIterHandle, typename DMatrixHandle,
          typename DataIterResetCallback, typename XGDMatrixCallbackNext>
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
                         DataIterResetCallback *reset,
                         XGDMatrixCallbackNext *next, float missing,
                         int nthread,
                         int max_bin) {
#if defined(XGBOOST_USE_CUDA)
  // Only the GPU implementation exists at the moment.
  return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing, nthread, max_bin);
#else
  // CPU-only build: fail loudly rather than return a half-constructed object.
  common::AssertGPUSupport();
  return nullptr;
#endif
}

// Explicit instantiation for the C API callback types, keeping the template
// definition local to this translation unit.
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
                                  DataIterResetCallback, XGDMatrixCallbackNext>(
    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
    XGDMatrixCallbackNext *next, float missing, int nthread,
    int max_bin);
template <typename AdapterT> template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,

View File

@ -252,6 +252,31 @@ EllpackPageImpl::EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense
ELLPACK_SPECIALIZATION(data::CudfAdapter) ELLPACK_SPECIALIZATION(data::CudfAdapter)
ELLPACK_SPECIALIZATION(data::CupyAdapter) ELLPACK_SPECIALIZATION(data::CupyAdapter)
// Construct an Ellpack page directly from a device adapter batch.  `nthread`
// is not used by this body — presumably kept for signature parity with the
// adapter-pointer constructor; TODO confirm.
template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                 bool is_dense, int nthread,
                                 common::Span<size_t> row_counts_span,
                                 size_t row_stride, size_t n_rows, size_t n_cols,
                                 common::HistogramCuts const& cuts) {
  dh::safe_cuda(cudaSetDevice(device));
  // Delegate allocation/layout to the basic constructor, then fill the page
  // from the adapter batch and mark the trailing slots of each row as null.
  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
  CopyDataToEllpack(batch, this, device, missing);
  WriteNullValues(this, device, row_counts_span);
}

// Explicit instantiations for the two supported device adapter batch types.
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T)                      \
  template EllpackPageImpl::EllpackPageImpl(                     \
      __BATCH_T batch, float missing, int device,                \
      bool is_dense, int nthread,                                \
      common::Span<size_t> row_counts_span,                      \
      size_t row_stride, size_t n_rows, size_t n_cols,           \
      common::HistogramCuts const& cuts);

ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
// A functor that copies the data from one EllpackPage to another. // A functor that copies the data from one EllpackPage to another.
struct CopyPage { struct CopyPage {
common::CompressedBufferWriter cbw; common::CompressedBufferWriter cbw;
@ -279,6 +304,10 @@ size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_GE(n_rows * row_stride, offset + num_elements); CHECK_GE(n_rows * row_stride, offset + num_elements);
if (page == this) {
LOG(FATAL) << "Concatenating the same Ellpack.";
return this->n_rows * this->row_stride;
}
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device); page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, num_elements, CopyPage(this, page, offset)); dh::LaunchN(device, num_elements, CopyPage(this, page, offset));

View File

@ -149,7 +149,7 @@ class EllpackPageImpl {
EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl(int device, common::HistogramCuts cuts,
const SparsePage& page, const SparsePage& page,
bool is_dense,size_t row_stride); bool is_dense, size_t row_stride);
/*! /*!
* \brief Constructor from an existing DMatrix. * \brief Constructor from an existing DMatrix.
@ -161,8 +161,16 @@ class EllpackPageImpl {
template <typename AdapterT> template <typename AdapterT>
explicit EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense, int nthread, explicit EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense, int nthread,
int max_bin, common::Span<size_t> row_counts_span, int max_bin,
common::Span<size_t> row_counts_span,
size_t row_stride); size_t row_stride);
template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, int nthread,
common::Span<size_t> row_counts_span,
size_t row_stride, size_t n_rows, size_t n_cols,
common::HistogramCuts const& cuts);
/*! \brief Copy the elements of the given ELLPACK page into this page. /*! \brief Copy the elements of the given ELLPACK page into this page.
* *
* @param device The GPU device to use. * @param device The GPU device to use.

View File

@ -0,0 +1,188 @@
/*!
* Copyright 2020 XGBoost contributors
*/
#include <memory>
#include <type_traits>
#include <algorithm>
#include "../common/hist_util.cuh"
#include "simple_batch_iterator.h"
#include "iterative_device_dmatrix.h"
#include "sparse_page_source.h"
#include "ellpack_page.cuh"
#include "proxy_dmatrix.h"
#include "device_adapter.cuh"
namespace xgboost {
namespace data {
// Invoke `fn` with the concrete adapter batch stored inside the proxy.
// Only Cupy (array interface) and Cudf (columnar) adapters are supported.
template <typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
  if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
    auto value = dmlc::get<std::shared_ptr<CupyAdapter>>(
        proxy->Adapter())->Value();
    return fn(value);
  } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
        proxy->Adapter())->Value();
    return fn(value);
  } else {
    LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
    // Unreachable after LOG(FATAL); the call below only exists so that every
    // branch returns the same deduced type.
    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
        proxy->Adapter())->Value();
    return fn(value);
  }
}
// Consume the external iterator in two passes: first sketch every batch to
// build the histogram cuts, then re-iterate to compress all batches into a
// single Ellpack page.
void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missing, int nthread) {
  // A handle passed to external iterator.
  auto handle = static_cast<std::shared_ptr<DMatrix>*>(proxy_);
  CHECK(handle);
  DMatrixProxy* proxy = static_cast<DMatrixProxy*>(handle->get());
  CHECK(proxy);
  // The external iterator
  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
    iter_handle, reset_, next_};

  dh::XGBCachingDeviceAllocator<char> alloc;

  // Helpers reading the dimensions of whatever batch is currently staged in
  // the proxy.
  auto num_rows = [&]() {
    return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
  };
  auto num_cols = [&]() {
    return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
  };
  size_t row_stride = 0;
  size_t nnz = 0;
  // Sketch for all batches.
  iter.Reset();
  common::HistogramCuts cuts;
  common::DenseCuts dense_cuts(&cuts);
  std::vector<common::SketchContainer> sketch_containers;
  size_t batches = 0;
  size_t accumulated_rows = 0;
  bst_feature_t cols = 0;
  // ---- Pass 1: sketch each batch and accumulate row/nnz statistics. ----
  while (iter.Next()) {
    auto device = proxy->DeviceIdx();
    dh::safe_cuda(cudaSetDevice(device));
    // The column count is fixed by the first batch.
    if (cols == 0) {
      cols = num_cols();
    } else {
      CHECK_EQ(cols, num_cols()) << "Inconsistent number of columns.";
    }
    // One sketch container per batch; merged column-wise after the loop.
    sketch_containers.emplace_back(batch_param_.max_bin, num_cols(), num_rows());
    auto* p_sketch = &sketch_containers.back();
    if (proxy->Info().weights_.Size() != 0) {
      proxy->Info().weights_.SetDevice(device);
      Dispatch(proxy, [&](auto const &value) {
        common::AdapterDeviceSketchWeighted(value, batch_param_.max_bin,
                                            proxy->Info(),
                                            missing, device, p_sketch);
      });
    } else {
      Dispatch(proxy, [&](auto const &value) {
        common::AdapterDeviceSketch(value, batch_param_.max_bin, missing,
                                    device, p_sketch);
      });
    }

    auto batch_rows = num_rows();
    accumulated_rows += batch_rows;
    dh::caching_device_vector<size_t> row_counts(batch_rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(),
                                         row_counts.size());
    // Track the widest row seen so the final Ellpack layout can hold any row.
    row_stride =
        std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
          return GetRowCounts(value, row_counts_span, device, missing);
        }));
    nnz += thrust::reduce(thrust::cuda::par(alloc),
                          row_counts.begin(), row_counts.end());
    batches++;
  }

  // Merging multiple batches for each column
  std::vector<common::WQSketch::SummaryContainer> summary_array(cols);
  size_t intermediate_num_cuts = std::min(
      accumulated_rows, static_cast<size_t>(batch_param_.max_bin *
                                            common::SketchContainer::kFactor));
  size_t nbytes =
      common::WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
  // Columns are independent, so the merge is parallelized over columns.
#pragma omp parallel for num_threads(nthread) if (nthread > 0)
  for (omp_ulong c = 0; c < cols; ++c) {
    for (auto& sketch_batch : sketch_containers) {
      common::WQSketch::SummaryContainer summary;
      sketch_batch.sketches_.at(c).GetSummary(&summary);
      // Release the per-batch sketch memory as soon as it is summarized.
      sketch_batch.sketches_.at(c).Init(0, 1);
      summary_array.at(c).Reduce(summary, nbytes);
    }
  }
  sketch_containers.clear();

  // Build the final summary.
  std::vector<common::WQSketch> sketches(cols);
#pragma omp parallel for num_threads(nthread) if (nthread > 0)
  for (omp_ulong c = 0; c < cols; ++c) {
    sketches.at(c).Init(
        accumulated_rows,
        1.0 / (common::SketchContainer::kFactor * batch_param_.max_bin));
    sketches.at(c).PushSummary(summary_array.at(c));
  }
  dense_cuts.Init(&sketches, batch_param_.max_bin, accumulated_rows);
  summary_array.clear();

  // Shape info must be set before IsDense() is consulted below.
  this->info_.num_col_ = cols;
  this->info_.num_row_ = accumulated_rows;
  this->info_.num_nonzero_ = nnz;

  // ---- Pass 2: construct the final ellpack page. ----
  page_.reset(new EllpackPage);
  *(page_->Impl()) = EllpackPageImpl(proxy->DeviceIdx(), cuts, this->IsDense(),
                                     row_stride, accumulated_rows);
  size_t offset = 0;
  iter.Reset();
  while (iter.Next()) {
    auto device = proxy->DeviceIdx();
    dh::safe_cuda(cudaSetDevice(device));
    auto rows = num_rows();
    dh::caching_device_vector<size_t> row_counts(rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(),
                                         row_counts.size());
    Dispatch(proxy, [=](auto const& value) {
      return GetRowCounts(value, row_counts_span, device, missing);
    });
    auto is_dense = this->IsDense();
    // Compress this batch with the shared cuts, then append it to the page.
    auto new_impl = Dispatch(proxy, [&](auto const &value) {
      return EllpackPageImpl(value, missing, device, is_dense, nthread,
                             row_counts_span, row_stride, rows, cols, cuts);
    });
    size_t num_elements = page_->Impl()->Copy(device, &new_impl, offset);
    offset += num_elements;

    // Accumulate per-batch meta info (labels, weights, ...) when there is
    // more than one batch; the single-batch case is moved wholesale below.
    proxy->Info().num_row_ = num_rows();
    proxy->Info().num_col_ = cols;
    if (batches != 1) {
      this->info_.Extend(std::move(proxy->Info()), false);
    }
  }

  if (batches == 1) {
    this->info_ = std::move(proxy->Info());
    CHECK_EQ(proxy->Info().labels_.Size(), 0);
  }

  iter.Reset();
  // Synchronise worker columns
  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
}
// The entire dataset was compressed into a single Ellpack page at
// construction time, so this always yields exactly one batch.  `param` is
// unused: the bins were fixed when the page was built.
BatchSet<EllpackPage> IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) {
  CHECK(page_);
  auto* impl = new SimpleBatchIteratorImpl<EllpackPage>(page_.get());
  auto begin_iter = BatchIterator<EllpackPage>(impl);
  return BatchSet<EllpackPage>(begin_iter);
}
} // namespace data
} // namespace xgboost

View File

@ -0,0 +1,76 @@
/*!
* Copyright 2020 by Contributors
* \file iterative_device_dmatrix.h
*/
#ifndef XGBOOST_DATA_ITERATIVE_DEVICE_DMATRIX_H_
#define XGBOOST_DATA_ITERATIVE_DEVICE_DMATRIX_H_
#include <vector>
#include <string>
#include <utility>
#include <memory>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/c_api.h"
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
/**
 * \brief DMatrix backed by an external data iterator.  All batches are
 *        quantile-sketched and compressed into a single in-memory Ellpack
 *        page during construction; only Ellpack batches can be read back.
 */
class IterativeDeviceDMatrix : public DMatrix {
  MetaInfo info_;
  BatchParam batch_param_;
  std::shared_ptr<EllpackPage> page_;

  // Handle and callbacks supplied through the C API; not owned here.
  DMatrixHandle proxy_;
  DataIterResetCallback *reset_;
  XGDMatrixCallbackNext *next_;

 public:
  // Consumes the external iterator and builds the compressed Ellpack page.
  // (Removed a redundant duplicate `public:` access specifier here.)
  void Initialize(DataIterHandle iter, float missing, int nthread);

  explicit IterativeDeviceDMatrix(DataIterHandle iter, DMatrixHandle proxy,
                                  DataIterResetCallback *reset,
                                  XGDMatrixCallbackNext *next, float missing,
                                  int nthread, int max_bin)
      : proxy_{proxy}, reset_{reset}, next_{next} {
    batch_param_ = BatchParam{0, max_bin, 0};
    this->Initialize(iter, missing, nthread);
  }

  bool EllpackExists() const override { return true; }
  bool SparsePageExists() const override { return false; }

  DMatrix *Slice(common::Span<int32_t const> ridxs) override {
    LOG(FATAL) << "Slicing DMatrix is not supported for Device DMatrix.";
    return nullptr;
  }

  // Row/column views are not materialized for this DMatrix type.
  BatchSet<SparsePage> GetRowBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
  }
  BatchSet<CSCPage> GetColumnBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<CSCPage>(BatchIterator<CSCPage>(nullptr));
  }
  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
    LOG(FATAL) << "Not implemented.";
    return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(nullptr));
  }
  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;

  bool SingleColBlock() const override { return false; }

  MetaInfo& Info() override {
    return info_;
  }
  MetaInfo const& Info() const override {
    return info_;
  }
};
} // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_ITERATIVE_DEVICE_DMATRIX_H_

View File

@ -0,0 +1,166 @@
/*!
* Copyright 2020 XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/data/iterative_device_dmatrix.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/device_adapter.cuh"
namespace xgboost {
namespace data {
// Checks that a DMatrix built through the iterator interface yields the same
// cuts, min values, cut pointers and compressed gidx buffer as a DMatrix
// built from the equivalent single array.
void TestEquivalent(float sparsity) {
  CudaArrayIterForTest iter{sparsity};
  IterativeDeviceDMatrix m(
      &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
      0, 256);
  size_t offset = 0;
  auto first = (*m.GetEllpackBatches({}).begin()).Impl();
  std::unique_ptr<EllpackPageImpl> page_concatenated {
    new EllpackPageImpl(0, first->Cuts(), first->is_dense,
                        first->row_stride, 1000 * 100)};
  // Concatenate every batch produced by the iterator into one page.
  for (auto& batch : m.GetBatches<EllpackPage>()) {
    auto page = batch.Impl();
    size_t num_elements = page_concatenated->Copy(0, page, offset);
    offset += num_elements;
  }
  auto from_iter = page_concatenated->GetDeviceAccessor(0);
  ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::kCols);
  ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::kRows);

  // Reference: the same data loaded in one shot through the Cupy adapter.
  std::string interface_str = iter.AsArray();
  auto adapter = CupyAdapter(interface_str);
  std::unique_ptr<DMatrix> dm{
    DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
  BatchParam bp {0, 256};
  for (auto& ellpack : dm->GetBatches<EllpackPage>(bp)) {
    auto from_data = ellpack.Impl()->GetDeviceAccessor(0);

    std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
    std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size());
    std::vector<uint32_t> cut_ptrs_iter(from_iter.feature_segments.size());
    dh::CopyDeviceSpanToVector(&cuts_from_iter, from_iter.gidx_fvalue_map);
    dh::CopyDeviceSpanToVector(&min_fvalues_iter, from_iter.min_fvalue);
    dh::CopyDeviceSpanToVector(&cut_ptrs_iter, from_iter.feature_segments);

    std::vector<float> cuts_from_data(from_data.gidx_fvalue_map.size());
    std::vector<float> min_fvalues_data(from_data.min_fvalue.size());
    std::vector<uint32_t> cut_ptrs_data(from_data.feature_segments.size());
    dh::CopyDeviceSpanToVector(&cuts_from_data, from_data.gidx_fvalue_map);
    dh::CopyDeviceSpanToVector(&min_fvalues_data, from_data.min_fvalue);
    dh::CopyDeviceSpanToVector(&cut_ptrs_data, from_data.feature_segments);

    ASSERT_EQ(cuts_from_iter.size(), cuts_from_data.size());
    for (size_t i = 0; i < cuts_from_iter.size(); ++i) {
      EXPECT_NEAR(cuts_from_iter[i], cuts_from_data[i], kRtEps);
    }
    ASSERT_EQ(min_fvalues_iter.size(), min_fvalues_data.size());
    for (size_t i = 0; i < min_fvalues_iter.size(); ++i) {
      ASSERT_NEAR(min_fvalues_iter[i], min_fvalues_data[i], kRtEps);
    }
    ASSERT_EQ(cut_ptrs_iter.size(), cut_ptrs_data.size());
    for (size_t i = 0; i < cut_ptrs_iter.size(); ++i) {
      ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]);
    }

    auto const& buffer_from_iter = page_concatenated->gidx_buffer;
    auto const& buffer_from_data = ellpack.Impl()->gidx_buffer;
    ASSERT_NE(buffer_from_data.Size(), 0);
    // Fixed: the original compared buffer_from_data against itself, which is
    // trivially true; compare the iterator-built buffer against the
    // data-built buffer instead.
    ASSERT_EQ(buffer_from_iter.ConstHostVector(),
              buffer_from_data.ConstHostVector());
  }
}
// The iteratively-built DMatrix must be equivalent to the adapter-built one
// for both fully dense and partially sparse inputs.
TEST(IterativeDeviceDMatrix, Basic) {
  for (float sparsity : {0.0f, 0.5f}) {
    TestEquivalent(sparsity);
  }
}
// Dense row-major CUDA array input: every stored gradient index must match the
// bin found by searching the quantile cuts for the original feature value, and
// all source batches must be concatenated into a single EllpackPage.
TEST(IterativeDeviceDMatrix, RowMajor) {
  CudaArrayIterForTest iter(0.0f);  // sparsity 0 => fully dense data
  IterativeDeviceDMatrix m(
      &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
      0, 256);
  size_t n_batches = 0;
  std::string interface_str = iter.AsArray();
  for (auto& ellpack : m.GetBatches<EllpackPage>()) {
    n_batches ++;
    auto impl = ellpack.Impl();
    // Decode the compressed gradient-index buffer on the host.
    common::CompressedIterator<uint32_t> iterator(
        impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
    auto cols = CudaArrayIterForTest::kCols;
    auto rows = CudaArrayIterForTest::kRows;
    // Pull the raw feature values back from the device through the
    // array-interface JSON produced by the iterator (presumably describing the
    // full generated data -- see CudaArrayIterForTest::AsArray).
    auto j_interface =
        Json::Load({interface_str.c_str(), interface_str.size()});
    ArrayInterface loaded {get<Object const>(j_interface)};
    std::vector<float> h_data(cols * rows);
    common::Span<float> s_data{static_cast<float*>(loaded.data), cols * rows};
    dh::CopyDeviceSpanToVector(&h_data, s_data);
    for(auto i = 0ull; i < rows * cols; i++) {
      // Row-major layout: column index cycles with period `cols`.
      int column_idx = i % cols;
      EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]);
    }
    EXPECT_EQ(m.Info().num_col_, cols);
    EXPECT_EQ(m.Info().num_row_, rows);
    // Dense input: every cell is a valid (non-missing) entry.
    EXPECT_EQ(m.Info().num_nonzero_, rows * cols);
  }
  // All batches are concatenated.
  ASSERT_EQ(n_batches, 1);
}
// Input containing NaNs: missing entries must be excluded from num_nonzero_
// and represented as the null bin value in the ELLPACK page.
TEST(IterativeDeviceDMatrix, RowMajorMissing) {
  const float kMissing = std::numeric_limits<float>::quiet_NaN();
  size_t rows = 10;
  size_t cols = 2;
  CudaArrayIterForTest iter(0.0f, rows, cols, 2);  // 2 batches
  std::string interface_str = iter.AsArray();
  auto j_interface =
      Json::Load({interface_str.c_str(), interface_str.size()});
  ArrayInterface loaded {get<Object const>(j_interface)};
  std::vector<float> h_data(cols * rows);
  common::Span<float> s_data{static_cast<float*>(loaded.data), cols * rows};
  dh::CopyDeviceSpanToVector(&h_data, s_data);
  // Punch three missing values into the generated data ...
  h_data[1] = kMissing;
  h_data[5] = kMissing;
  h_data[6] = kMissing;
  // ... and write the modified buffer back to the device allocation that the
  // array-interface JSON points at.
  auto ptr = thrust::device_ptr<float>(
      reinterpret_cast<float *>(get<Integer>(j_interface["data"][0])));
  thrust::copy(h_data.cbegin(), h_data.cend(), ptr);
  IterativeDeviceDMatrix m(
      &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
      0, 256);
  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto impl = ellpack.Impl();
  // Decode the compressed gradient-index buffer on the host.
  common::CompressedIterator<uint32_t> iterator(
      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue());
  // null values get placed after valid values in a row, so the NaN written at
  // h_data[6] (first slot of row 3) shows up at ELLPACK position 7.
  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(m.Info().num_col_, cols);
  EXPECT_EQ(m.Info().num_row_, rows);
  // Exactly the three entries overwritten with NaN above are missing.
  EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
}
// IsDense() must reflect the sparsity of the source data: a fully dense input
// yields a dense DMatrix, any amount of missing values makes it sparse.
TEST(IterativeDeviceDMatrix, IsDense) {
  int num_bins = 16;
  auto test = [num_bins] (float sparsity) {
    CudaArrayIterForTest iter(sparsity);
    // Fix: the original hard-coded 256 bins while declaring and capturing
    // num_bins without using it; pass num_bins as clearly intended.
    // Density does not depend on the bin count, so the assertions are
    // unaffected.
    IterativeDeviceDMatrix m(
        &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
        0, num_bins);
    if (sparsity == 0.0) {
      ASSERT_TRUE(m.IsDense());
    } else {
      ASSERT_FALSE(m.IsDense());
    }
  };
  test(0.0);
  test(0.1);
}
} // namespace data
} // namespace xgboost

View File

@ -1,17 +1,43 @@
#include <xgboost/c_api.h>
#include "helpers.h" #include "helpers.h"
#include "../../src/data/device_adapter.cuh" #include "../../src/data/device_adapter.cuh"
#include "../../src/data/device_dmatrix.h" #include "../../src/data/iterative_device_dmatrix.h"
namespace xgboost { namespace xgboost {
CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
size_t cols, size_t batches)
: rows_{rows}, cols_{cols}, n_batches_{batches} {
XGProxyDMatrixCreate(&proxy_);
rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
rng_->Device(0);
std::tie(batches_, interface_) =
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();
}
CudaArrayIterForTest::~CudaArrayIterForTest() { XGDMatrixFree(proxy_); }
int CudaArrayIterForTest::Next() {
if (iter_ == n_batches_) {
return 0;
}
XGDeviceQuantileDMatrixSetDataCudaArrayInterface(proxy_, batches_[iter_].c_str());
iter_++;
return 1;
}
size_t constexpr CudaArrayIterForTest::kRows;
size_t constexpr CudaArrayIterForTest::kCols;
std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label, std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label,
bool float_label, bool float_label,
size_t classes) { size_t classes) {
std::vector<HostDeviceVector<float>> storage(cols_); CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
std::string arr = this->GenerateColumnarArrayInterface(&storage); auto m = std::make_shared<data::IterativeDeviceDMatrix>(
auto adapter = data::CudfAdapter(arr); &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
std::shared_ptr<DMatrix> m { 0, bins_);
new data::DeviceDMatrix{&adapter,
std::numeric_limits<float>::quiet_NaN(), 1, 256}};
return m; return m;
} }
} // namespace xgboost } // namespace xgboost

View File

@ -304,5 +304,51 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row
HostDeviceVector<GradientPair> gpair(h_gpair); HostDeviceVector<GradientPair> gpair(h_gpair);
return gpair; return gpair;
} }
typedef void *DMatrixHandle; // NOLINT(*);
class CudaArrayIterForTest {
HostDeviceVector<float> data_;
size_t iter_ {0};
DMatrixHandle proxy_;
std::unique_ptr<RandomDataGenerator> rng_;
std::vector<std::string> batches_;
std::string interface_;
size_t rows_;
size_t cols_;
size_t n_batches_;
public:
size_t static constexpr kRows { 1000 };
size_t static constexpr kBatches { 100 };
size_t static constexpr kCols { 13 };
explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
size_t cols = kCols, size_t batches = kBatches);
~CudaArrayIterForTest();
std::string AsArray() const {
return interface_;
}
int Next();
void Reset() {
iter_ = 0;
}
size_t Iter() const { return iter_; }
auto Proxy() -> decltype(proxy_) { return proxy_; }
};
typedef void *DataIterHandle; // NOLINT(*)
inline void Reset(DataIterHandle self) {
static_cast<CudaArrayIterForTest*>(self)->Reset();
}
inline int Next(DataIterHandle self) {
return static_cast<CudaArrayIterForTest*>(self)->Next();
}
} // namespace xgboost } // namespace xgboost
#endif #endif

View File

@ -76,15 +76,15 @@ TEST(GPUPredictor, EllpackTraining) {
.Bins(kBins) .Bins(kBins)
.Device(0) .Device(0)
.GenerateDeviceDMatrix(true); .GenerateDeviceDMatrix(true);
std::vector<HostDeviceVector<float>> storage(kCols); HostDeviceVector<float> storage(kRows * kCols);
auto columnar = RandomDataGenerator{kRows, kCols, 0.0} auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
.Device(0) .Device(0)
.GenerateColumnarArrayInterface(&storage); .GenerateArrayInterface(&storage);
auto adapter = data::CudfAdapter(columnar); auto adapter = data::CupyAdapter(columnar);
std::shared_ptr<DMatrix> p_full { std::shared_ptr<DMatrix> p_full {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1) DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
}; };
TestTrainingPrediction(kRows, "gpu_hist", p_full, p_ellpack); TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack);
} }
TEST(GPUPredictor, ExternalMemoryTest) { TEST(GPUPredictor, ExternalMemoryTest) {

View File

@ -32,7 +32,8 @@ TEST(Predictor, PredictionCache) {
EXPECT_ANY_THROW(container.Entry(m)); EXPECT_ANY_THROW(container.Entry(m));
} }
void TestTrainingPrediction(size_t rows, std::string tree_method, void TestTrainingPrediction(size_t rows, size_t bins,
std::string tree_method,
std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_full,
std::shared_ptr<DMatrix> p_hist) { std::shared_ptr<DMatrix> p_hist) {
size_t constexpr kCols = 16; size_t constexpr kCols = 16;

View File

@ -52,7 +52,7 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
} }
// p_full and p_hist should come from the same data set. // p_full and p_hist should come from the same data set.
void TestTrainingPrediction(size_t rows, std::string tree_method, void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_full,
std::shared_ptr<DMatrix> p_hist); std::shared_ptr<DMatrix> p_hist);