Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache cannot be violated.
- Provide an interface for internal algorithms to process data asynchronously.
Jiaming Yuan 2021-07-16 12:33:31 +08:00 committed by GitHub
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions

amalgamation/xgboost-all0.cc

@@ -37,18 +37,14 @@
 #include "../src/data/simple_dmatrix.cc"
 #include "../src/data/sparse_page_raw_format.cc"
 #include "../src/data/ellpack_page.cc"
-#include "../src/data/ellpack_page_source.cc"
 #include "../src/data/gradient_index.cc"
+#include "../src/data/sparse_page_dmatrix.cc"
+#include "../src/data/proxy_dmatrix.cc"
 // prediction
 #include "../src/predictor/predictor.cc"
 #include "../src/predictor/cpu_predictor.cc"
-#if DMLC_ENABLE_STD_THREAD
-#include "../src/data/sparse_page_dmatrix.cc"
-#include "../src/data/sparse_page_source.cc"
-#endif
 // trees
 #include "../src/tree/param.cc"
 #include "../src/tree/tree_model.cc"

include/xgboost/c_api.h

@@ -223,19 +223,31 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data,
  * - XGBCallbackDataIterNext
  * - XGDMatrixCreateFromDataIter
  *
- * Another set is used by Quantile based DMatrix (used by hist algorithm) for reducing
- * memory usage. Currently only GPU implementation is available. It accept foreign data
- * iterators as callbacks and works similar to external memory. For GPU Hist, the data is
- * first compressed by quantile sketching then merged. This is particular useful for
- * distributed setting as it eliminates 2 copies of data. 1 by a `concat` from external
- * library to make the data into a blob for normal DMatrix initialization, another by the
- * internal CSR copy of DMatrix. Related functions are:
+ * Another set is used by the external data iterator. It accepts foreign data iterators
+ * as callbacks. There are 2 different scenarios where users might want to pass in
+ * callbacks instead of raw data. The first is the Quantile DMatrix used by GPU Hist.
+ * For this case, the data is first compressed by quantile sketching, then merged. This
+ * is particularly useful in a distributed setting, as it eliminates 2 copies of the
+ * data: 1 by a `concat` from the external library to make the data into a blob for
+ * normal DMatrix initialization, another by the internal CSR copy of the DMatrix. The
+ * second use case is external memory support, where users can pass a custom data
+ * iterator into XGBoost for loading data in batches. There are short notes on each of
+ * the use cases in the respective DMatrix factory functions.
  *
+ * Related functions are:
+ *
+ * # Factory functions
+ * - `XGDMatrixCreateFromCallback` for external memory
+ * - `XGDeviceQuantileDMatrixCreateFromCallback` for quantile DMatrix
+ *
+ * # Proxy that callers can use to pass data to XGBoost
 * - XGProxyDMatrixCreate
 * - XGDMatrixCallbackNext
 * - DataIterResetCallback
 * - XGProxyDMatrixSetDataCudaArrayInterface
 * - XGProxyDMatrixSetDataCudaColumnar
+* - XGProxyDMatrixSetDataDense
+* - XGProxyDMatrixSetDataCSR
 * - ... (data setters)
 */
@@ -308,17 +320,9 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
     const char* cache_info,
     DMatrixHandle *out);

-/* == Second set of callback functions, used by constructing Quantile based DMatrix. ===
- *
- * Short note for how to use the second set of callback for GPU Hist tree method.
- *
- * Step 0: Define a data iterator with 2 methods `reset`, and `next`.
- * Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
- * Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *         `XGDeviceQuantileDMatrixCreateFromCallback`.
- * Step 3: Call appropriate data setters in `next` functions.
- *
- * See test_iterative_device_dmatrix.cu or Python interface for examples.
+/**
+ * Second set of callback functions, used by constructing Quantile DMatrix or external
+ * memory DMatrix using custom iterator.
 */
/*! /*!
@@ -344,8 +348,53 @@ XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter); // NOLINT(
  */
 XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLINT(*)

 /*!
- * \brief Create a device DMatrix with data iterator.
+ * \brief Create an external memory DMatrix with data iterator.
+ *
+ * Short note for how to use the second set of callbacks for external memory data support:
+ *
+ * - Step 0: Define a data iterator with 2 methods `reset` and `next`.
+ * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
+ * - Step 2: Pass the iterator handle, proxy handle and the 2 methods into
+ *           `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
+ * - Step 3: Call appropriate data setters in the `next` functions.
+ *
+ * For example usage see demo/c-api/external-memory and the sketch after this declaration.
+ *
+ * \param iter A handle to external data iterator.
+ * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
+ * \param reset Callback function resetting the iterator state.
+ * \param next Callback function yielding the next batch of data.
+ * \param c_json_config JSON encoded parameters for DMatrix construction. Accepted fields are:
+ *
+ *   - missing: Which value to represent missing values.
+ *   - cache_prefix: The path of the cache file. The caller must initialize all the
+ *     directories in this path.
+ *   - nthread (optional): Number of threads used for initializing the DMatrix.
+ *
+ * \param out The created external memory DMatrix.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
+                                        DMatrixHandle proxy,
+                                        DataIterResetCallback *reset,
+                                        XGDMatrixCallbackNext *next,
+                                        char const* c_json_config,
+                                        DMatrixHandle *out);
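A minimal sketch of the workflow above, assuming a caller that feeds two in-memory
batches of a single dense float32 feature; the iterator struct, batch shapes, and the
-1.0 missing value are illustrative, not part of the API:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <xgboost/c_api.h>

/* Illustrative iterator state: 2 batches x 4 rows x 1 feature. */
typedef struct {
  float data[2][4];
  int cur;                    /* index of the next batch to yield */
  DMatrixHandle proxy;
  char array_interface[128];  /* staged JSON for the current batch */
} Iterator;

static void reset(DataIterHandle self) { ((Iterator *)self)->cur = 0; }

static int next(DataIterHandle self) {
  Iterator *it = (Iterator *)self;
  if (it->cur == 2) {
    return 0;  /* 0 ends the iteration */
  }
  /* NumPy-style __array_interface__ describing a 4x1 float32 array. */
  snprintf(it->array_interface, sizeof(it->array_interface),
           "{\"data\": [%" PRIuPTR ", false], \"shape\": [4, 1],"
           " \"typestr\": \"<f4\", \"version\": 3}",
           (uintptr_t)it->data[it->cur]);
  XGProxyDMatrixSetDataDense(it->proxy, it->array_interface);
  it->cur += 1;
  return 1;  /* 1 means a batch was produced */
}

int main(void) {
  Iterator it = {{{0, 1, 2, 3}, {4, 5, 6, 7}}, 0, NULL, {0}};
  XGProxyDMatrixCreate(&it.proxy);
  /* Fields mirror the c_json_config documentation above. */
  char const *config =
      "{\"missing\": -1.0, \"cache_prefix\": \"cache\", \"nthread\": 4}";
  DMatrixHandle dmat;
  XGDMatrixCreateFromCallback(&it, it.proxy, reset, next, config, &dmat);
  /* ... train with the external memory DMatrix ... */
  XGDMatrixFree(dmat);
  XGDMatrixFree(it.proxy);
  return 0;
}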
+/*!
+ * \brief Create a Quantile DMatrix with data iterator.
+ *
+ * Short note for how to use the second set of callbacks for the GPU Hist tree method:
+ *
+ * - Step 0: Define a data iterator with 2 methods `reset` and `next`.
+ * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
+ * - Step 2: Pass the iterator handle, proxy handle and the 2 methods into
+ *           `XGDeviceQuantileDMatrixCreateFromCallback`.
+ * - Step 3: Call appropriate data setters in the `next` functions.
+ *
+ * See test_iterative_device_dmatrix.cu or the Python interface for examples.
 *
 * \param iter A handle to external data iterator.
 * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`.

@@ -362,6 +411,7 @@
 XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
     DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
     XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin,
     DMatrixHandle *out);
 /*!
  * \brief Set data on a DMatrix proxy.
  *
@@ -387,6 +437,33 @@ XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
 XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle,
                                               const char *c_interface_str);

+/*!
+ * \brief Set data on a DMatrix proxy.
+ *
+ * \param handle A DMatrix proxy created by XGProxyDMatrixCreate
+ * \param c_interface_str Null terminated JSON document string representation of array
+ *                        interface.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle,
+                                       char const *c_interface_str);
+
+/*!
+ * \brief Set data on a DMatrix proxy.
+ *
+ * \param handle A DMatrix proxy created by XGProxyDMatrixCreate
+ * \param indptr JSON encoded __array_interface__ to row pointers in CSR.
+ * \param indices JSON encoded __array_interface__ to column indices in CSR.
+ * \param data JSON encoded __array_interface__ to values in CSR.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
+                                     char const *indices, char const *data,
+                                     bst_ulong ncol);
+
 /*
  * ==========================- End data callback APIs ==========================
  */
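To illustrate the CSR setters, a sketch of the three array-interface documents for the
2x3 matrix {{1, 0, 2}, {0, 3, 0}}; the <address of ...> placeholders stand for the
integer addresses of the caller's buffers and are not literal syntax:

/* indptr = {0, 2, 3}, indices = {0, 2, 1}, data = {1.0f, 3.0f, 2.0f} */
char const *indptr =
    "{\"data\": [<address of indptr>, false], \"shape\": [3],"
    " \"typestr\": \"<u8\", \"version\": 3}";
char const *indices =
    "{\"data\": [<address of indices>, false], \"shape\": [3],"
    " \"typestr\": \"<u4\", \"version\": 3}";
char const *data =
    "{\"data\": [<address of data>, false], \"shape\": [3],"
    " \"typestr\": \"<f4\", \"version\": 3}";
XGProxyDMatrixSetDataCSR(proxy, indptr, indices, data, /*ncol=*/3);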

include/xgboost/data.h

@@ -171,9 +171,12 @@ class MetaInfo {
   * \param that The other MetaInfo object.
   *
   * \param accumulate_rows Whether rows need to be accumulated in this function. If
-  *                        client code knows number of rows in advance, set this parameter to false.
+  *                        client code knows the number of rows in advance, set this
+  *                        parameter to false.
+  * \param check_column Whether the extend method should check the consistency of
+  *                     columns.
   */
-  void Extend(MetaInfo const& that, bool accumulate_rows);
+  void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);

  private:
  /*! \brief argsort of labels */
@@ -211,14 +214,12 @@ struct BatchParam {
   int gpu_id;
   /*! \brief Maximum number of bins per feature for histograms. */
   int max_bin{0};
-  /*! \brief Page size for external memory mode. */
-  size_t gpu_page_size;
   BatchParam() = default;
-  BatchParam(int32_t device, int32_t max_bin, size_t gpu_page_size = 0)
-      : gpu_id{device}, max_bin{max_bin}, gpu_page_size{gpu_page_size} {}
-  inline bool operator!=(const BatchParam& other) const {
-    return gpu_id != other.gpu_id || max_bin != other.max_bin ||
-           gpu_page_size != other.gpu_page_size;
+  BatchParam(int32_t device, int32_t max_bin)
+      : gpu_id{device}, max_bin{max_bin} {}
+  bool operator!=(const BatchParam& other) const {
+    return gpu_id != other.gpu_id || max_bin != other.max_bin;
   }
 };
@@ -390,11 +391,12 @@ class GHistIndexMatrix;
 template<typename T>
 class BatchIteratorImpl {
  public:
+  using iterator_category = std::forward_iterator_tag;  // NOLINT
   virtual ~BatchIteratorImpl() = default;
-  virtual T& operator*() = 0;
   virtual const T& operator*() const = 0;
-  virtual void operator++() = 0;
+  virtual BatchIteratorImpl& operator++() = 0;
   virtual bool AtEnd() const = 0;
+  virtual std::shared_ptr<T const> Page() const = 0;
 };

 template<typename T>
@@ -402,15 +404,12 @@ class BatchIterator {
  public:
   using iterator_category = std::forward_iterator_tag;  // NOLINT
   explicit BatchIterator(BatchIteratorImpl<T>* impl) { impl_.reset(impl); }
+  explicit BatchIterator(std::shared_ptr<BatchIteratorImpl<T>> impl) { impl_ = impl; }

-  void operator++() {
+  BatchIterator &operator++() {
     CHECK(impl_ != nullptr);
     ++(*impl_);
-  }
-
-  T& operator*() {
-    CHECK(impl_ != nullptr);
-    return *(*impl_);
+    return *this;
   }

   const T& operator*() const {
@@ -428,6 +427,10 @@
     return impl_->AtEnd();
   }

+  std::shared_ptr<T const> Page() const {
+    return impl_->Page();
+  }
+
  private:
   std::shared_ptr<BatchIteratorImpl<T>> impl_;
 };
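A sketch of how the slimmed-down iterator is consumed; `dmat` is assumed to be a
`DMatrix *`, and `Page()` hands out a shared read-only handle that can outlive the
iterator:

for (auto it = dmat->GetBatches<SparsePage>().begin(); !it.AtEnd(); ++it) {
  SparsePage const &page = *it;  // only const access remains
  std::shared_ptr<SparsePage const> pinned = it.Page();
  // ... process `page`; `pinned` keeps the batch alive past ++it ...
}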
@@ -499,8 +502,7 @@ class DMatrix {
   static DMatrix* Load(const std::string& uri,
                        bool silent,
                        bool load_row_split,
-                       const std::string& file_format = "auto",
-                       size_t page_size = kPageSize);
+                       const std::string& file_format = "auto");

  /**
   * \brief Creates a new DMatrix from an external data adapter.
@@ -516,8 +518,7 @@
   */
  template <typename AdapterT>
  static DMatrix* Create(AdapterT* adapter, float missing, int nthread,
-                        const std::string& cache_prefix = "",
-                        size_t page_size = kPageSize);
+                        const std::string& cache_prefix = "");

  /**
   * \brief Create a new Quantile based DMatrix used for histogram based algorithm.
@@ -545,6 +546,31 @@
                        int nthread,
                        int max_bin);
+  /**
+   * \brief Create an external memory DMatrix with callbacks.
+   *
+   * \tparam DataIterHandle         External iterator type, defined in C API.
+   * \tparam DMatrixHandle          DMatrix handle, defined in C API.
+   * \tparam DataIterResetCallback  Callback for reset, prototype defined in C API.
+   * \tparam XGDMatrixCallbackNext  Callback for next, prototype defined in C API.
+   *
+   * \param iter    External data iterator
+   * \param proxy   A handle to ProxyDMatrix
+   * \param reset   Callback for reset
+   * \param next    Callback for next
+   * \param missing Value that should be treated as missing.
+   * \param nthread Number of threads used for initialization.
+   * \param cache   Prefix of cache file path.
+   *
+   * \return A created external memory DMatrix.
+   */
+  template <typename DataIterHandle, typename DMatrixHandle,
+            typename DataIterResetCallback, typename XGDMatrixCallbackNext>
+  static DMatrix *Create(DataIterHandle iter, DMatrixHandle proxy,
+                         DataIterResetCallback *reset,
+                         XGDMatrixCallbackNext *next, float missing,
+                         int32_t nthread, std::string cache);
+
   virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;
   /*! \brief Number of rows per page in external memory. Approximately 100MB per page for
    *  dataset with 100 features. */

include/xgboost/generic_parameters.h

@@ -29,8 +29,6 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
   int gpu_id;
   // fail when gpu_id is invalid
   bool fail_on_invalid_gpu_id {false};
-  // gpu page size in external memory mode, 0 means using the default.
-  size_t gpu_page_size;
   bool validate_parameters {false};

   /*!
@@ -66,10 +64,6 @@
     DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
         .set_default(false)
         .describe("Fail with error when gpu_id is invalid.");
-    DMLC_DECLARE_FIELD(gpu_page_size)
-        .set_default(0)
-        .set_lower_bound(0)
-        .describe("GPU page size when running in external memory mode.");
     DMLC_DECLARE_FIELD(validate_parameters)
         .set_default(false)
         .describe("Enable checking whether parameters are used or not.");

src/c_api/c_api.cc

@@ -190,6 +190,35 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
 #endif

 // Create from data iterator
+XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
+                                        DMatrixHandle proxy,
+                                        DataIterResetCallback *reset,
+                                        XGDMatrixCallbackNext *next,
+                                        char const* c_json_config,
+                                        DMatrixHandle *out) {
+  API_BEGIN();
+  auto config = Json::Load(StringView{c_json_config});
+  float missing = get<Number const>(config["missing"]);
+  std::string cache = get<String const>(config["cache_prefix"]);
+  int32_t n_threads = omp_get_max_threads();
+  if (!IsA<Null>(config["nthread"])) {
+    n_threads = get<Integer const>(config["nthread"]);
+  }
+  *out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
+      iter, proxy, reset, next, missing, n_threads, cache)};
+  API_END();
+}
+
+XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
+    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
+    XGDMatrixCallbackNext *next, float missing, int nthread,
+    int max_bin, DMatrixHandle *out) {
+  API_BEGIN();
+  *out = new std::shared_ptr<xgboost::DMatrix>{
+      xgboost::DMatrix::Create(iter, proxy, reset, next, missing, nthread, max_bin)};
+  API_END();
+}
+
 XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
   API_BEGIN();
   *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);
@@ -221,15 +250,31 @@ XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle,
   API_END();
 }

-XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
-    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
-    XGDMatrixCallbackNext *next, float missing, int nthread,
-    int max_bin, DMatrixHandle *out) {
+XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle,
+                                       char const *c_interface_str) {
   API_BEGIN();
-  *out = new std::shared_ptr<xgboost::DMatrix>{
-      xgboost::DMatrix::Create(iter, proxy, reset, next, missing, nthread, max_bin)};
+  CHECK_HANDLE();
+  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
+  CHECK(p_m);
+  auto m = static_cast<xgboost::data::DMatrixProxy*>(p_m->get());
+  CHECK(m) << "Current DMatrix type does not support set data.";
+  m->SetArrayData(c_interface_str);
   API_END();
 }
+XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
+                                     char const *indices, char const *data,
+                                     xgboost::bst_ulong ncol) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
+  CHECK(p_m);
+  auto m = static_cast<xgboost::data::DMatrixProxy*>(p_m->get());
+  CHECK(m) << "Current DMatrix type does not support set data.";
+  m->SetCSRData(indptr, indices, data, ncol, true);
+  API_END();
+}
+
 // End Create from data iterator

 XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,


@@ -91,7 +91,6 @@ void PruneImpl(int device,
     }

     float w = back.rmin - front.rmax;
-    assert(w != 0);
     auto budget = static_cast<float>(d_out.size());
     assert(budget != 0);
     auto q = ((static_cast<float>(idx) * w) / (static_cast<float>(to) - 1.0f) + front.rmax);

src/data/data.cc

@@ -22,11 +22,10 @@
 #include "../common/threading_utils.h"
 #include "../data/adapter.h"
 #include "../data/iterative_device_dmatrix.h"
+#include "file_iterator.h"

-#if DMLC_ENABLE_STD_THREAD
 #include "./sparse_page_source.h"
 #include "./sparse_page_dmatrix.h"
-#endif  // DMLC_ENABLE_STD_THREAD

 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
@@ -500,13 +499,17 @@
   }
 }

-void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
+void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_column) {
   if (accumulate_rows) {
     this->num_row_ += that.num_row_;
   }
   if (this->num_col_ != 0) {
-    CHECK_EQ(this->num_col_, that.num_col_)
-        << "Number of columns must be consistent across batches.";
+    if (check_column) {
+      CHECK_EQ(this->num_col_, that.num_col_)
+          << "Number of columns must be consistent across batches.";
+    } else {
+      this->num_col_ = std::max(this->num_col_, that.num_col_);
+    }
   }
   this->num_col_ = that.num_col_;
@@ -630,11 +633,34 @@ DMatrix::~DMatrix() {
   }
 }

+DMatrix *TryLoadBinary(std::string fname, bool silent) {
+  int magic;
+  std::unique_ptr<dmlc::Stream> fi(
+      dmlc::Stream::Create(fname.c_str(), "r", true));
+  if (fi != nullptr) {
+    common::PeekableInStream is(fi.get());
+    if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
+      if (!DMLC_IO_NO_ENDIAN_SWAP) {
+        dmlc::ByteSwap(&magic, sizeof(magic), 1);
+      }
+      if (magic == data::SimpleDMatrix::kMagic) {
+        DMatrix *dmat = new data::SimpleDMatrix(&is);
+        if (!silent) {
+          LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_
+                       << " matrix with " << dmat->Info().num_nonzero_
+                       << " entries loaded from " << fname;
+        }
+        return dmat;
+      }
+    }
+  }
+  return nullptr;
+}
+
 DMatrix* DMatrix::Load(const std::string& uri,
                        bool silent,
                        bool load_row_split,
-                       const std::string& file_format,
-                       const size_t page_size) {
+                       const std::string& file_format) {
   std::string fname, cache_file;
   size_t dlm_pos = uri.find('#');
   if (dlm_pos != std::string::npos) {
@@ -682,34 +708,33 @@
   // legacy handling of binary data loading
   if (file_format == "auto" && npart == 1) {
-    int magic;
-    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
-    if (fi != nullptr) {
-      common::PeekableInStream is(fi.get());
-      if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
-        if (!DMLC_IO_NO_ENDIAN_SWAP) {
-          dmlc::ByteSwap(&magic, sizeof(magic), 1);
-        }
-        if (magic == data::SimpleDMatrix::kMagic) {
-          DMatrix* dmat = new data::SimpleDMatrix(&is);
-          if (!silent) {
-            LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
-                         << dmat->Info().num_nonzero_ << " entries loaded from " << uri;
-          }
-          return dmat;
-        }
-      }
+    DMatrix *loaded = TryLoadBinary(fname, silent);
+    if (loaded) {
+      return loaded;
     }
   }

-  std::unique_ptr<dmlc::Parser<uint32_t> > parser(
-      dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
-  data::FileAdapter adapter(parser.get());
-
   DMatrix* dmat {nullptr};
   try {
-    dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1,
-                           cache_file, page_size);
+    if (cache_file.empty()) {
+      std::unique_ptr<dmlc::Parser<uint32_t>> parser(
+          dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart,
+                                         file_format.c_str()));
+      data::FileAdapter adapter(parser.get());
+      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(),
+                             1, cache_file);
+    } else {
+      data::FileIterator iter{fname, uint32_t(partid), uint32_t(npart),
+                              file_format};
+      dmat = new data::SparsePageDMatrix{
+          &iter,
+          iter.Proxy(),
+          data::fileiter::Reset,
+          data::fileiter::Next,
+          std::numeric_limits<float>::quiet_NaN(),
+          1,
+          cache_file};
+    }
   } catch (dmlc::Error &e) {
     std::vector<std::string> splited = common::Split(fname, '#');
     std::vector<std::string> args = common::Split(splited.front(), '?');
@@ -734,10 +759,6 @@ DMatrix* DMatrix::Load(const std::string& uri,
     LOG(FATAL) << "Encountered parser error:\n" << e.what();
   }

-  if (!silent) {
-    LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
-                 << dmat->Info().num_nonzero_ << " entries loaded from " << uri;
-  }
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data not knowing the real number of features. */
@@ -769,12 +790,19 @@ DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
                          XGDMatrixCallbackNext *next, float missing,
                          int nthread,
                          int max_bin) {
-#if defined(XGBOOST_USE_CUDA)
-  return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing, nthread, max_bin);
-#else
-  common::AssertGPUSupport();
-  return nullptr;
-#endif
+  return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing,
+                                          nthread, max_bin);
+}
+
+template <typename DataIterHandle, typename DMatrixHandle,
+          typename DataIterResetCallback, typename XGDMatrixCallbackNext>
+DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
+                         DataIterResetCallback *reset,
+                         XGDMatrixCallbackNext *next, float missing,
+                         int32_t n_threads,
+                         std::string cache) {
+  return new data::SparsePageDMatrix(iter, proxy, reset, next, missing, n_threads,
+                                     cache);
 }

@@ -783,49 +811,42 @@ template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
                                   DataIterResetCallback, XGDMatrixCallbackNext>(
     DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
     XGDMatrixCallbackNext *next, float missing, int nthread,
     int max_bin);

+template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
+                                  DataIterResetCallback, XGDMatrixCallbackNext>(
+    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
+    XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
 template <typename AdapterT>
 DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
-                         const std::string& cache_prefix, size_t page_size) {
-  if (cache_prefix.length() == 0) {
-    // Data split mode is fixed to be row right now.
-    return new data::SimpleDMatrix(adapter, missing, nthread);
-  } else {
-#if DMLC_ENABLE_STD_THREAD
-    return new data::SparsePageDMatrix(adapter, missing, nthread, cache_prefix,
-                                       page_size);
-#else
-    LOG(FATAL) << "External memory is not enabled in mingw";
-    return nullptr;
-#endif  // DMLC_ENABLE_STD_THREAD
-  }
+                         const std::string& cache_prefix) {
+  return new data::SimpleDMatrix(adapter, missing, nthread);
 }
 template DMatrix* DMatrix::Create<data::DenseAdapter>(
     data::DenseAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::ArrayAdapter>(
     data::ArrayAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::CSRAdapter>(
     data::CSRAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::CSCAdapter>(
     data::CSCAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::DataTableAdapter>(
     data::DataTableAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::FileAdapter>(
     data::FileAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(
     data::CSRArrayAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix *
 DMatrix::Create(data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
                                       XGBoostBatchCSR> *adapter,
-                float missing, int nthread, const std::string &cache_prefix,
-                size_t page_size);
+                float missing, int nthread, const std::string &cache_prefix);
 SparsePage SparsePage::GetTranspose(int num_columns) const {
   SparsePage transpose;
@@ -1044,6 +1065,8 @@ SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
 template uint64_t
 SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
 template uint64_t
+SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, int nthread);
+template uint64_t
 SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
 template uint64_t
 SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread);

src/data/data.cu

@@ -167,7 +167,7 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
 template <typename AdapterT>
 DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
-                         const std::string& cache_prefix, size_t page_size) {
+                         const std::string& cache_prefix) {
   CHECK_EQ(cache_prefix.size(), 0)
       << "Device memory construction is not currently supported with external "
          "memory.";
@@ -176,8 +176,8 @@
 template DMatrix* DMatrix::Create<data::CudfAdapter>(
     data::CudfAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 template DMatrix* DMatrix::Create<data::CupyAdapter>(
     data::CupyAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix, size_t page_size);
+    const std::string& cache_prefix);
 }  // namespace xgboost

src/data/ellpack_page.cu

@@ -122,6 +122,7 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
   dmat->Info().feature_types.SetDevice(param.gpu_id);
   auto ft = dmat->Info().feature_types.ConstDeviceSpan();
   monitor_.Start("BinningCompression");
+  CHECK(dmat->SingleColBlock());
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
     CreateHistIndices(param.gpu_id, batch, ft);
   }
@@ -301,9 +302,8 @@ struct CopyPage {
   // The number of elements to skip.
   size_t offset;

-  CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
-      : cbw{dst->NumSymbols()},
-        dst_data_d{dst->gidx_buffer.DevicePointer()},
+  CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset)
+      : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()},
         src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
         offset(offset) {}
@@ -314,7 +314,8 @@
 };

 // Copy the data from the given EllpackPage to the current page.
-size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
+size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
+                             size_t offset) {
   monitor_.Start("Copy");
   size_t num_elements = page->n_rows * page->row_stride;
   CHECK_EQ(row_stride, page->row_stride);
@@ -351,7 +352,7 @@ struct CompactPage {
   size_t base_rowid;
   size_t row_stride;

-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src,
               common::Span<size_t> row_indexes)
       : cbw{dst->NumSymbols()},
         dst_data_d{dst->gidx_buffer.DevicePointer()},
@@ -374,7 +375,7 @@
 };

 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
+void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
                               common::Span<size_t> row_indexes) {
   monitor_.Start("Compact");
   CHECK_EQ(row_stride, page->row_stride);
@@ -459,7 +460,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
         gidx_buffer.DevicePointer(), row_ptrs.data().get(),
         entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
         device_accessor.feature_segments.data(), feature_types,
-        row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
+        batch_row_begin, batch_nrows, row_stride,
         null_gidx_value);
   }
 }

src/data/ellpack_page.cuh

@@ -164,7 +164,7 @@ class EllpackPageImpl {
   * @param offset The number of elements to skip before copying.
   * @returns The number of elements copied.
   */
-  size_t Copy(int device, EllpackPageImpl* page, size_t offset);
+  size_t Copy(int device, EllpackPageImpl const *page, size_t offset);

  /*! \brief Compact the given ELLPACK page into the current page.
   *
@@ -172,7 +172,7 @@
   * @param page The ELLPACK page to compact from.
   * @param row_indexes Row indexes for the compacted page.
   */
-  void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
+  void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);

  /*! \return Number of instances in the page. */

src/data/ellpack_page_source.cc (deleted)

@@ -1,24 +0,0 @@
-/*!
- * Copyright 2019 XGBoost contributors
- */
-#ifndef XGBOOST_USE_CUDA
-#include <dmlc/base.h>
-
-#if DMLC_ENABLE_STD_THREAD
-#include "ellpack_page_source.h"
-#include <xgboost/data.h>
-
-namespace xgboost {
-namespace data {
-
-EllpackPageSource::EllpackPageSource(DMatrix* dmat,
-                                     const std::string& cache_info,
-                                     const BatchParam& param) noexcept(false) {
-  LOG(FATAL)
-      << "Internal Error: "
-         "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-}
-
-}  // namespace data
-}  // namespace xgboost
-#endif  // DMLC_ENABLE_STD_THREAD
-#endif  // XGBOOST_USE_CUDA

src/data/ellpack_page_source.cu

@@ -1,89 +1,24 @@
 /*!
- * Copyright 2019 XGBoost contributors
+ * Copyright 2019-2021 XGBoost contributors
  */
 #include <memory>
 #include <utility>

-#include "../common/hist_util.cuh"
 #include "ellpack_page.cuh"
 #include "ellpack_page_source.h"
-#include "sparse_page_source.h"

 namespace xgboost {
 namespace data {

-// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
-// each CSR page, and write the accumulated ELLPACK pages to disk.
-EllpackPageSource::EllpackPageSource(DMatrix* dmat,
-                                     const std::string& cache_info,
-                                     const BatchParam& param) noexcept(false) {
-  cache_info_ = ParseCacheInfo(cache_info, kPageType_);
-  for (auto file : cache_info_.name_shards) {
-    CheckCacheFileExists(file);
-  }
-  if (param.gpu_page_size > 0) {
-    page_size_ = param.gpu_page_size;
-  }
-
-  monitor_.Init("ellpack_page_source");
-  dh::safe_cuda(cudaSetDevice(param.gpu_id));
-
-  monitor_.Start("Quantiles");
-  size_t row_stride = GetRowStride(dmat);
-  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
-  monitor_.Stop("Quantiles");
-
-  monitor_.Start("WriteEllpackPages");
-  WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
-  monitor_.Stop("WriteEllpackPages");
-
-  external_prefetcher_.reset(
-      new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
-}
-
-// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
-void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
-                                          const common::HistogramCuts& cuts,
-                                          const std::string& cache_info,
-                                          size_t row_stride) const {
-  auto cinfo = ParseCacheInfo(cache_info, kPageType_);
-  const size_t extra_buffer_capacity = 6;
-  SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
-                                       extra_buffer_capacity);
-  std::shared_ptr<EllpackPage> page;
-  SparsePage temp_host_page;
-  writer.Alloc(&page);
-  auto* impl = page->Impl();
-
-  auto ft = dmat->Info().feature_types.ConstDeviceSpan();
-  size_t bytes_write = 0;
-  double tstart = dmlc::GetTime();
-  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    temp_host_page.Push(batch);
-
-    size_t mem_cost_bytes =
-        EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
-    if (mem_cost_bytes >= page_size_) {
-      bytes_write += mem_cost_bytes;
-      *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
-                              row_stride, ft);
-      writer.PushWrite(std::move(page));
-      writer.Alloc(&page);
-      impl = page->Impl();
-      temp_host_page.Clear();
-      double tdiff = dmlc::GetTime() - tstart;
-      LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
-                << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
-                << (bytes_write >> 20UL) << " written";
-    }
-  }
-  if (temp_host_page.Size() != 0) {
-    *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
-                            row_stride, ft);
-    writer.PushWrite(std::move(page));
-  }
-}
+void EllpackPageSource::Fetch() {
+  if (!this->ReadCache()) {
+    auto const &csr = source_->Page();
+    this->page_.reset(new EllpackPage{});
+    auto *impl = this->page_->Impl();
+    *impl = EllpackPageImpl(param_.gpu_id, *cuts_, *csr, is_dense_, row_stride_,
+                            feature_types_);
+    page_->SetBaseRowId(csr->base_rowid);
+    this->WriteCache();
+  }
+}

 }  // namespace data
 }  // namespace xgboost

src/data/ellpack_page_source.h

@@ -1,5 +1,5 @@
 /*!
- * Copyright 2019 by XGBoost Contributors
+ * Copyright 2019-2021 by XGBoost Contributors
  */

 #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
@@ -8,57 +8,44 @@
 #include <xgboost/data.h>
 #include <memory>
 #include <string>
+#include <utility>

-#include "../common/timer.h"
+#include "../common/common.h"
 #include "../common/hist_util.h"
 #include "sparse_page_source.h"

 namespace xgboost {
 namespace data {

-/*!
- * \brief External memory data source for ELLPACK format.
- */
-class EllpackPageSource {
- public:
-  /*!
-   * \brief Create source from cache files the cache_prefix.
-   * \param cache_prefix The prefix of cache we want to solve.
-   */
-  explicit EllpackPageSource(DMatrix* dmat,
-                             const std::string& cache_info,
-                             const BatchParam& param) noexcept(false);
-
-  BatchSet<EllpackPage> GetBatchSet() {
-    auto begin_iter = BatchIterator<EllpackPage>(
-        new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
-                                    EllpackPage>(external_prefetcher_.get()));
-    return BatchSet<EllpackPage>(begin_iter);
-  }
-
-  ~EllpackPageSource() {
-    external_prefetcher_.reset();
-    for (auto file : cache_info_.name_shards) {
-      TryDeleteCacheFile(file);
-    }
-  }
-
- private:
-  void WriteEllpackPages(int device, DMatrix* dmat,
-                         const common::HistogramCuts& cuts,
-                         const std::string& cache_info,
-                         size_t row_stride) const;
-
-  /*! \brief The page type string for ELLPACK. */
-  const std::string kPageType_{".ellpack.page"};
-
-  size_t page_size_{DMatrix::kPageSize};
-  common::Monitor monitor_;
-  std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
-  CacheInfo cache_info_;
+class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
+  bool is_dense_;
+  size_t row_stride_;
+  BatchParam param_;
+  common::Span<FeatureType const> feature_types_;
+  std::unique_ptr<common::HistogramCuts> cuts_;
+
+ public:
+  EllpackPageSource(
+      float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
+      std::shared_ptr<Cache> cache, BatchParam param,
+      std::unique_ptr<common::HistogramCuts> cuts, bool is_dense,
+      size_t row_stride, common::Span<FeatureType const> feature_types,
+      std::shared_ptr<SparsePageSource> source)
+      : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache),
+        is_dense_{is_dense}, row_stride_{row_stride}, param_{param},
+        feature_types_{feature_types}, cuts_{std::move(cuts)} {
+    this->source_ = source;
+    this->Fetch();
+  }
+
+  void Fetch() final;
 };

+#if !defined(XGBOOST_USE_CUDA)
+inline void EllpackPageSource::Fetch() {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 }  // namespace data
 }  // namespace xgboost

src/data/file_iterator.h (new file, +115 lines)

@@ -0,0 +1,115 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
#define XGBOOST_DATA_FILE_ITERATOR_H_
#include <algorithm>
#include <string>
#include <memory>
#include <vector>
#include <utility>
#include "dmlc/data.h"
#include "xgboost/c_api.h"
#include "xgboost/json.h"
#include "array_interface.h"
namespace xgboost {
namespace data {
/**
 * An iterator for implementing external memory support with file inputs. Users of
 * external memory are encouraged to define their own file parsers/loaders, so this one
 * is just here for compatibility with old versions of XGBoost and the CLI interface.
 */
class FileIterator {
  // URI of the input file. It also encodes parameters such as whether the index is
  // 1-based; the dmlc parser decodes this information.
  std::string uri_;
  // Equal to the worker rank in distributed training; used to split the file into
  // parts for each worker.
  uint32_t part_idx_;
  // Equal to the total number of workers.
  uint32_t n_parts_;
  // Format of the input file, like "libsvm".
  std::string type_;
DMatrixHandle proxy_;
std::unique_ptr<dmlc::Parser<uint32_t>> parser_;
// Temporary reference to stage the data.
dmlc::RowBlock<uint32_t, float> row_block_;
// Storage for the array interface strings.
std::string indptr_;
std::string values_;
std::string indices_;
public:
FileIterator(std::string uri, unsigned part_index, unsigned num_parts,
std::string type)
: uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts},
type_{std::move(type)} {
XGProxyDMatrixCreate(&proxy_);
}
~FileIterator() {
XGDMatrixFree(proxy_);
}
int Next() {
CHECK(parser_);
if (parser_->Next()) {
row_block_ = parser_->Value();
indptr_ = MakeArrayInterface(row_block_.offset, row_block_.size + 1);
values_ = MakeArrayInterface(row_block_.value,
row_block_.offset[row_block_.size]);
indices_ = MakeArrayInterface(row_block_.index,
row_block_.offset[row_block_.size]);
size_t n_columns = *std::max_element(
row_block_.index,
row_block_.index + row_block_.offset[row_block_.size]);
      // The dmlc parser converts 1-based indexing back to 0-based indexing, so we can
      // treat both cases uniformly and simply add 1 to get the number of columns.
      n_columns += 1;
XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(),
values_.c_str(), n_columns);
if (row_block_.label) {
XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1);
}
if (row_block_.qid) {
XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1);
}
if (row_block_.weight) {
XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1);
}
// Continue iteration
return true;
} else {
// Stop iteration
return false;
}
}
auto Proxy() -> decltype(proxy_) { return proxy_; }
void Reset() {
CHECK(!type_.empty());
parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_,
n_parts_, type_.c_str()));
}
};
namespace fileiter {
inline void Reset(DataIterHandle self) {
static_cast<FileIterator*>(self)->Reset();
}
inline int Next(DataIterHandle self) {
return static_cast<FileIterator*>(self)->Next();
}
} // namespace fileiter
} // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_FILE_ITERATOR_H_
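A sketch of how this iterator is wired up, mirroring the cache-file branch added to
src/data/data.cc above; the URI and cache prefix are illustrative:

#include <limits>

xgboost::data::FileIterator iter{"train.libsvm", /*part_index=*/0, /*num_parts=*/1,
                                 "libsvm"};
auto *dmat = new xgboost::data::SparsePageDMatrix{
    &iter, iter.Proxy(),
    xgboost::data::fileiter::Reset, xgboost::data::fileiter::Next,
    std::numeric_limits<float>::quiet_NaN(), /*nthreads=*/1, "cache"};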

src/data/iterative_device_dmatrix.cu

@@ -143,7 +143,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
     proxy->Info().num_row_ = num_rows();
     proxy->Info().num_col_ = cols;
     if (batches != 1) {
-      this->info_.Extend(std::move(proxy->Info()), false);
+      this->info_.Extend(std::move(proxy->Info()), false, true);
     }
     n_batches_for_verification++;
   }
@@ -163,7 +163,7 @@
 BatchSet<EllpackPage> IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) {
   CHECK(page_);
   auto begin_iter =
-      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(page_.get()));
+      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(page_));
   return BatchSet<EllpackPage>(begin_iter);
 }
 }  // namespace data

src/data/iterative_device_dmatrix.h

@@ -14,6 +14,7 @@
 #include "xgboost/data.h"
 #include "xgboost/c_api.h"
 #include "proxy_dmatrix.h"
+#include "simple_batch_iterator.h"

 namespace xgboost {
 namespace data {
@@ -36,9 +37,10 @@ class IterativeDeviceDMatrix : public DMatrix {
                          XGDMatrixCallbackNext *next, float missing,
                          int nthread, int max_bin)
       : proxy_{proxy}, reset_{reset}, next_{next} {
-    batch_param_ = BatchParam{0, max_bin, 0};
+    batch_param_ = BatchParam{0, max_bin};
     this->Initialize(iter, missing, nthread);
   }
+  ~IterativeDeviceDMatrix() override = default;

   bool EllpackExists() const override { return true; }
   bool SparsePageExists() const override { return false; }
@@ -74,6 +76,18 @@
     return info_;
   }
 };
+
+#if !defined(XGBOOST_USE_CUDA)
+inline void IterativeDeviceDMatrix::Initialize(DataIterHandle iter, float missing, int nthread) {
+  common::AssertGPUSupport();
+}
+inline BatchSet<EllpackPage> IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) {
+  common::AssertGPUSupport();
+  auto begin_iter =
+      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(page_));
+  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace data
 }  // namespace xgboost

src/data/proxy_dmatrix.h

@@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 XGBoost contributors
+ * Copyright 2020-2021 XGBoost contributors
  */
 #ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
 #define XGBOOST_DATA_PROXY_DMATRIX_H_

src/data/simple_batch_iterator.h

@@ -1,10 +1,13 @@
 /*!
- * Copyright 2019 XGBoost contributors
+ * Copyright 2019-2021 XGBoost contributors
  */
 #ifndef XGBOOST_DATA_SIMPLE_BATCH_ITERATOR_H_
 #define XGBOOST_DATA_SIMPLE_BATCH_ITERATOR_H_

-#include <xgboost/data.h>
+#include <memory>
+#include <utility>
+
+#include "xgboost/data.h"

 namespace xgboost {
 namespace data {
@@ -12,20 +15,21 @@
 template<typename T>
 class SimpleBatchIteratorImpl : public BatchIteratorImpl<T> {
  public:
-  explicit SimpleBatchIteratorImpl(T* page) : page_(page) {}
-  T& operator*() override {
-    CHECK(page_ != nullptr);
-    return *page_;
-  }
+  explicit SimpleBatchIteratorImpl(std::shared_ptr<T const> page) : page_(std::move(page)) {}
   const T& operator*() const override {
     CHECK(page_ != nullptr);
     return *page_;
   }
-  void operator++() override { page_ = nullptr; }
+  SimpleBatchIteratorImpl &operator++() override {
+    page_ = nullptr;
+    return *this;
+  }
   bool AtEnd() const override { return page_ == nullptr; }
+  std::shared_ptr<T const> Page() const override { return page_; }

  private:
-  T* page_{nullptr};
+  std::shared_ptr<T const> page_{nullptr};
 };

 }  // namespace data

src/data/simple_dmatrix.cc

@@ -1,5 +1,5 @@
 /*!
- * Copyright 2014~2020 by Contributors
+ * Copyright 2014~2021 by Contributors
  * \file simple_dmatrix.cc
  * \brief the input data structure for gradient boosting
  * \author Tianqi Chen
@@ -27,7 +27,7 @@ const MetaInfo& SimpleDMatrix::Info() const { return info_; }

 DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
   auto out = new SimpleDMatrix;
-  SparsePage& out_page = out->sparse_page_;
+  SparsePage& out_page = *out->sparse_page_;
   for (auto const &page : this->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     auto& h_data = out_page.data.HostVector();
@@ -48,17 +48,17 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
   auto begin_iter = BatchIterator<SparsePage>(
-      new SimpleBatchIteratorImpl<SparsePage>(&sparse_page_));
+      new SimpleBatchIteratorImpl<SparsePage>(sparse_page_));
   return BatchSet<SparsePage>(begin_iter);
 }

 BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches() {
   // column page doesn't exist, generate it
   if (!column_page_) {
-    column_page_.reset(new CSCPage(sparse_page_.GetTranspose(info_.num_col_)));
+    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_)));
   }
   auto begin_iter =
-      BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_.get()));
+      BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
   return BatchSet<CSCPage>(begin_iter);
 }

@@ -66,11 +66,11 @@ BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches() {
   // Sorted column page doesn't exist, generate it
   if (!sorted_column_page_) {
     sorted_column_page_.reset(
-        new SortedCSCPage(sparse_page_.GetTranspose(info_.num_col_)));
+        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_)));
     sorted_column_page_->SortRows();
   }
   auto begin_iter = BatchIterator<SortedCSCPage>(
-      new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_.get()));
+      new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_));
   return BatchSet<SortedCSCPage>(begin_iter);
 }

@@ -86,7 +86,7 @@ BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(const BatchParam& param)
     batch_param_ = param;
   }
   auto begin_iter =
-      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_.get()));
+      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_));
   return BatchSet<EllpackPage>(begin_iter);
 }

@@ -100,7 +100,7 @@ BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(const BatchParam& par
     batch_param_ = param;
   }
   auto begin_iter = BatchIterator<GHistIndexMatrix>(
-      new SimpleBatchIteratorImpl<GHistIndexMatrix>(gradient_index_.get()));
+      new SimpleBatchIteratorImpl<GHistIndexMatrix>(gradient_index_));
   return BatchSet<GHistIndexMatrix>(begin_iter);
 }

@@ -110,8 +110,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   uint64_t default_max = std::numeric_limits<uint64_t>::max();
   uint64_t last_group_id = default_max;
   bst_uint group_size = 0;
-  auto& offset_vec = sparse_page_.offset.HostVector();
-  auto& data_vec = sparse_page_.data.HostVector();
+  auto& offset_vec = sparse_page_->offset.HostVector();
+  auto& data_vec = sparse_page_->data.HostVector();
   uint64_t inferred_num_columns = 0;
   uint64_t total_batch_size = 0;
   // batch_size is either number of rows or cols, depending on data layout
@@ -120,7 +120,7 @@
   // Iterate over batches of input data
   while (adapter->Next()) {
     auto& batch = adapter->Value();
-    auto batch_max_columns = sparse_page_.Push(batch, missing, nthread);
+    auto batch_max_columns = sparse_page_->Push(batch, missing, nthread);
     inferred_num_columns = std::max(batch_max_columns, inferred_num_columns);
     total_batch_size += batch.Size();
     // Append meta information if available
@@ -203,8 +203,8 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
   CHECK(in_stream->Read(&tmagic)) << "invalid input file format";
   CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
   info_.LoadBinary(in_stream);
-  in_stream->Read(&sparse_page_.offset.HostVector());
-  in_stream->Read(&sparse_page_.data.HostVector());
+  in_stream->Read(&sparse_page_->offset.HostVector());
+  in_stream->Read(&sparse_page_->data.HostVector());
 }

 void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
@@ -212,8 +212,8 @@
   int tmagic = kMagic;
   fo->Write(tmagic);
   info_.SaveBinary(fo.get());
-  fo->Write(sparse_page_.offset.HostVector());
-  fo->Write(sparse_page_.data.HostVector());
+  fo->Write(sparse_page_->offset.HostVector());
+  fo->Write(sparse_page_->data.HostVector());
 }

 template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,

View File

@ -28,7 +28,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
CHECK(!adapter->Next()); CHECK(!adapter->Next());
info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), adapter->DeviceIdx(), info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), adapter->DeviceIdx(),
missing, &sparse_page_); missing, sparse_page_.get());
info_.num_col_ = adapter->NumColumns(); info_.num_col_ = adapter->NumColumns();
info_.num_row_ = adapter->NumRows(); info_.num_row_ = adapter->NumRows();
// Synchronise worker columns // Synchronise worker columns

View File

@ -1,5 +1,5 @@
/*! /*!
* Copyright 2015 by Contributors * Copyright 2015-2021 by Contributors
* \file simple_dmatrix.h * \file simple_dmatrix.h
* \brief In-memory version of DMatrix. * \brief In-memory version of DMatrix.
* \author Tianqi Chen * \author Tianqi Chen
@ -47,11 +47,12 @@ class SimpleDMatrix : public DMatrix {
BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override; BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override;
MetaInfo info_; MetaInfo info_;
SparsePage sparse_page_; // Primary storage type // Primary storage type
std::unique_ptr<CSCPage> column_page_; std::shared_ptr<SparsePage> sparse_page_ = std::make_shared<SparsePage>();
std::unique_ptr<SortedCSCPage> sorted_column_page_; std::shared_ptr<CSCPage> column_page_;
std::unique_ptr<EllpackPage> ellpack_page_; std::shared_ptr<SortedCSCPage> sorted_column_page_;
std::unique_ptr<GHistIndexMatrix> gradient_index_; std::shared_ptr<EllpackPage> ellpack_page_;
std::shared_ptr<GHistIndexMatrix> gradient_index_;
BatchParam batch_param_; BatchParam batch_param_;
bool EllpackExists() const override { bool EllpackExists() const override {

View File

@ -1,59 +1,147 @@
/*! /*!
* Copyright 2014-2020 by Contributors * Copyright 2014-2021 by Contributors
* \file sparse_page_dmatrix.cc * \file sparse_page_dmatrix.cc
* \brief The external memory version of Page Iterator. * \brief The external memory version of Page Iterator.
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#include <dmlc/base.h>
#include <dmlc/timer.h>
#if DMLC_ENABLE_STD_THREAD
#include "./sparse_page_dmatrix.h" #include "./sparse_page_dmatrix.h"
#include "./simple_batch_iterator.h" #include "./simple_batch_iterator.h"
#include "gradient_index.h"
namespace xgboost { namespace xgboost {
namespace data { namespace data {
MetaInfo& SparsePageDMatrix::Info() { MetaInfo &SparsePageDMatrix::Info() { return info_; }
return row_source_->info;
const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy_handle,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int32_t nthreads, std::string cache_prefix)
: proxy_{proxy_handle}, iter_{iter_handle}, reset_{reset}, next_{next}, missing_{missing},
nthreads_{nthreads}, cache_prefix_{std::move(cache_prefix)} {
cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
if (rabit::IsDistributed()) {
cache_prefix_ += ("-r" + std::to_string(rabit::GetRank()));
}
DMatrixProxy *proxy = MakeProxy(proxy_);
auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
iter_, reset_, next_};
uint32_t n_batches = 0;
size_t n_features = 0;
size_t n_samples = 0;
size_t nnz = 0;
auto num_rows = [&]() {
return HostAdapterDispatch(
proxy, [](auto const &value) { return value.NumRows(); });
};
auto num_cols = [&]() {
return HostAdapterDispatch(
proxy, [](auto const &value) { return value.NumCols(); });
};
// The proxy is iterated together with the sparse page source, so we can obtain
// all the information in one pass.
for (auto const &page : this->GetRowBatchesImpl()) {
this->info_.Extend(std::move(proxy->Info()), false, false);
n_features = std::max(n_features, num_cols());
n_samples += num_rows();
nnz += page.data.Size();
n_batches++;
} }
const MetaInfo& SparsePageDMatrix::Info() const { iter.Reset();
return row_source_->info;
this->n_batches_ = n_batches;
this->info_.num_row_ = n_samples;
this->info_.num_col_ = n_features;
this->info_.num_nonzero_ = nnz;
rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
CHECK_NE(info_.num_col_, 0);
}
void SparsePageDMatrix::InitializeSparsePage() {
auto id = MakeCache(this, ".row.page", cache_prefix_, &cache_info_);
// Don't use the proxy DMatrix once this is initialized; this allows users to
// release the iterator and data.
if (cache_info_.at(id)->written) {
CHECK(sparse_page_source_);
sparse_page_source_->Reset();
return;
}
auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
iter_, reset_, next_};
DMatrixProxy *proxy = MakeProxy(proxy_);
sparse_page_source_.reset(); // Clear before creating a new one to prevent conflicts.
sparse_page_source_ = std::make_shared<SparsePageSource>(
iter, proxy, this->missing_, this->nthreads_, this->info_.num_col_,
this->n_batches_, cache_info_.at(id));
}
BatchSet<SparsePage> SparsePageDMatrix::GetRowBatchesImpl() {
this->InitializeSparsePage();
auto begin_iter = BatchIterator<SparsePage>(sparse_page_source_);
return BatchSet<SparsePage>(BatchIterator<SparsePage>(begin_iter));
} }
BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() { BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
return row_source_->GetBatchSet(); return this->GetRowBatchesImpl();
} }
BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() { BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() {
// Lazily instantiate auto id = MakeCache(this, ".col.page", cache_prefix_, &cache_info_);
CHECK_NE(this->Info().num_col_, 0);
this->InitializeSparsePage();
if (!column_source_) { if (!column_source_) {
column_source_.reset(new CSCPageSource(this, cache_info_)); column_source_ = std::make_shared<CSCPageSource>(
this->missing_, this->nthreads_, this->Info().num_col_,
this->n_batches_, cache_info_.at(id), sparse_page_source_);
} else {
column_source_->Reset();
} }
return column_source_->GetBatchSet(); auto begin_iter = BatchIterator<CSCPage>(column_source_);
return BatchSet<CSCPage>(BatchIterator<CSCPage>(begin_iter));
} }
BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() { BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() {
// Lazily instantiate auto id = MakeCache(this, ".sorted.col.page", cache_prefix_, &cache_info_);
CHECK_NE(this->Info().num_col_, 0);
this->InitializeSparsePage();
if (!sorted_column_source_) { if (!sorted_column_source_) {
sorted_column_source_.reset(new SortedCSCPageSource(this, cache_info_)); sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
this->missing_, this->nthreads_, this->Info().num_col_,
this->n_batches_, cache_info_.at(id), sparse_page_source_);
} else {
sorted_column_source_->Reset();
} }
return sorted_column_source_->GetBatchSet(); auto begin_iter = BatchIterator<SortedCSCPage>(sorted_column_source_);
return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(begin_iter));
} }
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) { BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam& param) {
CHECK_GE(param.gpu_id, 0);
CHECK_GE(param.max_bin, 2); CHECK_GE(param.max_bin, 2);
// Lazily instantiate // External memory is not supported; the whole index is built in memory.
if (!ellpack_source_ || (batch_param_ != param && param != BatchParam{})) { if (!ghist_index_source_ || (param != batch_param_ && param != BatchParam{})) {
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param)); this->InitializeSparsePage();
ghist_index_source_.reset(new GHistIndexMatrix{this, param.max_bin});
batch_param_ = param; batch_param_ = param;
} }
return ellpack_source_->GetBatchSet(); this->InitializeSparsePage();
auto begin_iter = BatchIterator<GHistIndexMatrix>(
new SimpleBatchIteratorImpl<GHistIndexMatrix>(ghist_index_source_));
return BatchSet<GHistIndexMatrix>(begin_iter);
} }
#if !defined(XGBOOST_USE_CUDA)
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
common::AssertGPUSupport();
auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace data } // namespace data
} // namespace xgboost } // namespace xgboost
#endif // DMLC_ENABLE_STD_THREAD

View File

@ -0,0 +1,46 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include "sparse_page_source.h"
#include "../common/hist_util.cuh"
#include "ellpack_page.cuh"
#include "sparse_page_dmatrix.h"
namespace xgboost {
namespace data {
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
CHECK_GE(param.gpu_id, 0);
CHECK_GE(param.max_bin, 2);
auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
size_t row_stride = 0;
this->InitializeSparsePage();
if (!cache_info_.at(id)->written || (batch_param_ != param && param != BatchParam{})) {
// reinitialize the cache
cache_info_.erase(id);
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
std::unique_ptr<common::HistogramCuts> cuts;
cuts.reset(new common::HistogramCuts{
common::DeviceSketch(param.gpu_id, this, param.max_bin, 0)});
this->InitializeSparsePage(); // reset after use.
row_stride = GetRowStride(this);
this->InitializeSparsePage(); // reset after use.
CHECK_NE(row_stride, 0);
batch_param_ = param;
auto ft = this->info_.feature_types.ConstDeviceSpan();
ellpack_page_source_.reset(); // release resources.
ellpack_page_source_.reset(new EllpackPageSource(
this->missing_, this->nthreads_, this->Info().num_col_,
this->n_batches_, cache_info_.at(id), param, std::move(cuts),
this->IsDense(), row_stride, ft, sparse_page_source_));
} else {
CHECK(sparse_page_source_);
ellpack_page_source_->Reset();
}
auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
}
} // namespace data
} // namespace xgboost

View File

@ -1,5 +1,5 @@
/*! /*!
* Copyright 2015 by Contributors * Copyright 2015-2021 by Contributors
* \file sparse_page_dmatrix.h * \file sparse_page_dmatrix.h
* \brief External-memory version of DMatrix. * \brief External-memory version of DMatrix.
* \author Tianqi Chen * \author Tianqi Chen
@ -13,24 +13,88 @@
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <map>
#include "ellpack_page_source.h" #include "ellpack_page_source.h"
#include "sparse_page_source.h" #include "sparse_page_source.h"
namespace xgboost { namespace xgboost {
namespace data { namespace data {
// Used for external memory. /**
 * \brief DMatrix used for external memory.
 *
 * External memory mode controls memory usage by splitting the data into
 * multiple batches. That doesn't mean we process exactly 1 batch at a time,
 * which would be terribly slow considering that we have to loop through the
 * whole dataset for every tree split. Instead we use async pre-fetching and
 * return each page as a shared pointer, letting the caller decide how many
 * batches it wants to process. The caller can process the data with async
 * functions or just stage those batches; making that decision is out of
 * scope for the sparse page DMatrix. These 2 optimizations might defeat the
 * purpose of splitting up the dataset, since loading all the batches makes
 * memory usage even worse than using a single batch. Essentially we need to
 * bound how many batches can be in memory at the same time.
 *
 * Right now the write to the cache is a sequential, blocking operation,
 * while reading from the cache is async with a hard-coded limit of 4 pages
 * as a heuristic. So the sparse DMatrix by itself keeps at most 9 pages
 * (possibly of different types) in main memory at the same time: 1 page
 * pending for write, 4 pre-fetched sparse pages, and 4 pre-fetched
 * dependent pages. If the caller stops iteration in the middle and starts
 * again, the number of pages in memory can hit 16 due to pre-fetching, but
 * that would be a bug in the caller's code (XGBoost doesn't discard a large
 * portion of data at the end; there's no sampling algo that samples only
 * the first portion of data).
 *
 * Of course, if the caller decides to retain some batches for parallel
 * processing, then we might load all pages into memory, which is also
 * considered a bug in the caller's code. So if an algo supports external
 * memory, it must make sure its queue for async calls has an upper limit.
 *
 * Another assumption we make is that the data is immutable, so the caller
 * should never change it. The sparse page source returns const pages to
 * make sure of that. If you want to change a generated page like Ellpack,
 * pass parameters into `GetBatches` to re-generate them instead of trying
 * to modify the pages in-place (see the usage sketch at the end of this
 * file).
 *
 * A possible optimization is dropping the sparse page once dependent pages
 * like Ellpack are constructed and cached.
 */
class SparsePageDMatrix : public DMatrix { class SparsePageDMatrix : public DMatrix {
MetaInfo info_;
BatchParam batch_param_;
std::map<std::string, std::shared_ptr<Cache>> cache_info_;
DMatrixHandle proxy_;
DataIterHandle iter_;
DataIterResetCallback *reset_;
XGDMatrixCallbackNext *next_;
float missing_;
int nthreads_;
std::string cache_prefix_;
uint32_t n_batches_ {0};
// The sparse page is the source for other page types, so it gets a dedicated member function.
void InitializeSparsePage();
// Non-virtual version that can be used in the constructor.
BatchSet<SparsePage> GetRowBatchesImpl();
public: public:
template <typename AdapterT> explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy,
explicit SparsePageDMatrix(AdapterT* adapter, float missing, int nthread, DataIterResetCallback *reset,
const std::string& cache_prefix, XGDMatrixCallbackNext *next, float missing,
size_t page_size = kPageSize) int32_t nthreads, std::string cache_prefix);
: cache_info_(std::move(cache_prefix)) {
row_source_.reset(new data::SparsePageSource(adapter, missing, nthread, ~SparsePageDMatrix() override {
cache_prefix, page_size)); // Clear out all resources before deleting the cache file.
sparse_page_source_.reset();
ellpack_page_source_.reset();
column_source_.reset();
sorted_column_source_.reset();
ghist_index_source_.reset();
for (auto const &kv : cache_info_) {
CHECK(kv.second);
auto n = kv.second->ShardName();
TryDeleteCacheFile(n);
}
} }
~SparsePageDMatrix() override = default;
MetaInfo& Info() override; MetaInfo& Info() override;
@ -47,30 +111,41 @@ class SparsePageDMatrix : public DMatrix {
BatchSet<CSCPage> GetColumnBatches() override; BatchSet<CSCPage> GetColumnBatches() override;
BatchSet<SortedCSCPage> GetSortedColumnBatches() override; BatchSet<SortedCSCPage> GetSortedColumnBatches() override;
BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override; BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override { BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override;
LOG(FATAL) << "Not implemented.";
return BatchSet<GHistIndexMatrix>(BatchIterator<GHistIndexMatrix>(nullptr));
}
// source data pointers. // source data pointers.
std::unique_ptr<SparsePageSource> row_source_; std::shared_ptr<SparsePageSource> sparse_page_source_;
std::unique_ptr<CSCPageSource> column_source_; std::shared_ptr<EllpackPageSource> ellpack_page_source_;
std::unique_ptr<SortedCSCPageSource> sorted_column_source_; std::shared_ptr<CSCPageSource> column_source_;
std::unique_ptr<EllpackPageSource> ellpack_source_; std::shared_ptr<SortedCSCPageSource> sorted_column_source_;
// saved batch param std::shared_ptr<GHistIndexMatrix> ghist_index_source_;
BatchParam batch_param_;
// the cache prefix
std::string cache_info_;
// Store column densities to avoid recalculating
std::vector<float> col_density_;
bool EllpackExists() const override { bool EllpackExists() const override {
return static_cast<bool>(ellpack_source_); return static_cast<bool>(ellpack_page_source_);
} }
bool SparsePageExists() const override { bool SparsePageExists() const override {
return static_cast<bool>(row_source_); return static_cast<bool>(sparse_page_source_);
} }
}; };
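// The helpers below derive cache names from the user-supplied prefix and the
// DMatrix address, so two live DMatrix objects never collide on disk.  As a
// hypothetical example, the prefix "DMatrix" and a matrix at address 0x55aa
// yield the id (and on-disk shard name) "DMatrix-0x55aa.row.page" for the row
// page cache.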
inline std::string MakeId(std::string prefix, SparsePageDMatrix *ptr) {
std::stringstream ss;
ss << ptr;
return prefix + "-" + ss.str();
}
inline std::string
MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
std::map<std::string, std::shared_ptr<Cache>> *out) {
auto &cache_info = *out;
auto name = MakeId(prefix, ptr);
auto id = name + format;
auto it = cache_info.find(id);
if (it == cache_info.cend()) {
cache_info[id].reset(new Cache{false, name, format});
}
return id;
}
} // namespace data } // namespace data
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ #endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
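A minimal usage sketch for the class above (not part of this commit): `p_fmat`,
the parameter values, and the availability of the internal page headers (e.g.
for `EllpackPage`, which also needs a GPU build) are assumptions made for
illustration only.

void Consume(xgboost::DMatrix *p_fmat) {
  // Pages arrive as shared pointers, pre-fetched asynchronously and iterated
  // forward-only; holding many of them at once defeats the memory bound
  // described in the class comment.
  for (auto const &page : p_fmat->GetBatches<xgboost::SparsePage>()) {
    (void)page.Size();  // process one batch at a time here
  }
  // Dependent pages are re-generated by passing a BatchParam, never by
  // modifying cached pages in-place.
  xgboost::BatchParam param{/*gpu_id=*/0, /*max_bin=*/256};
  for (auto const &page : p_fmat->GetBatches<xgboost::EllpackPage>(param)) {
    (void)page;
  }
}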

View File

@ -1,77 +0,0 @@
/*!
* Copyright (c) 2020 by XGBoost Contributors
*/
#include "sparse_page_source.h"
namespace xgboost {
namespace data {
void DataPool::Slice(std::shared_ptr<SparsePage> out, size_t offset,
size_t n_rows, size_t entry_offset) const {
auto const &in_offset = pool_.offset.HostVector();
auto const &in_data = pool_.data.HostVector();
auto &h_offset = out->offset.HostVector();
CHECK_LE(offset + n_rows + 1, in_offset.size());
h_offset.resize(n_rows + 1, 0);
std::transform(in_offset.cbegin() + offset,
in_offset.cbegin() + offset + n_rows + 1, h_offset.begin(),
[=](size_t ptr) { return ptr - entry_offset; });
auto &h_data = out->data.HostVector();
CHECK_GT(h_offset.size(), 0);
size_t n_entries = h_offset.back();
h_data.resize(n_entries);
CHECK_EQ(n_entries, in_offset.at(offset + n_rows) - in_offset.at(offset));
std::copy_n(in_data.cbegin() + in_offset.at(offset), n_entries,
h_data.begin());
}
void DataPool::SplitWritePage() {
size_t total = pool_.Size();
size_t offset = 0;
size_t entry_offset = 0;
do {
size_t n_rows = std::min(page_size_, total - offset);
std::shared_ptr<SparsePage> out;
writer_->Alloc(&out);
out->Clear();
out->SetBaseRowId(inferred_num_rows_);
this->Slice(out, offset, n_rows, entry_offset);
inferred_num_rows_ += out->Size();
offset += n_rows;
entry_offset += out->data.Size();
CHECK_NE(out->Size(), 0);
writer_->PushWrite(std::move(out));
} while (total - offset >= page_size_);
if (total - offset != 0) {
auto out = std::make_shared<SparsePage>();
this->Slice(out, offset, total - offset, entry_offset);
CHECK_NE(out->Size(), 0);
pool_.Clear();
pool_.Push(*out);
} else {
pool_.Clear();
}
}
size_t DataPool::Finalize() {
inferred_num_rows_ += pool_.Size();
if (pool_.Size() != 0) {
std::shared_ptr<SparsePage> page;
this->writer_->Alloc(&page);
page->Clear();
page->Push(pool_);
this->writer_->PushWrite(std::move(page));
}
if (inferred_num_rows_ == 0) {
std::shared_ptr<SparsePage> page;
this->writer_->Alloc(&page);
page->Clear();
this->writer_->PushWrite(std::move(page));
}
return inferred_num_rows_;
}
} // namespace data
} // namespace xgboost

View File

@ -0,0 +1,17 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include "sparse_page_source.h"
#include "proxy_dmatrix.cuh"
#include "simple_dmatrix.cuh"
namespace xgboost {
namespace data {
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
auto device = proxy->DeviceIdx();
Dispatch(proxy, [&](auto const &value) {
CopyToSparsePage(value, device, missing, page);
});
}
} // namespace data
} // namespace xgboost

View File

@ -1,54 +1,18 @@
/*! /*!
* Copyright (c) 2014-2019 by Contributors * Copyright (c) 2014-2021 by Contributors
* \file page_csr_source.h * \file sparse_page_source.h
* External memory data source, saved with sparse_batch_page binary format.
* \author Tianqi Chen
*
* -------------------------------------------------
* Random notes on implementation of external memory
* -------------------------------------------------
*
* As of XGBoost 1.3, the general pipeline is:
*
* dmlc text file parser --> file adapter --> sparse page source -> data pool -->
* write to binary cache --> load it back ~~> [ other pages (csc, ellpack, sorted csc) -->
* write to binary cache ] --> use it in various algorithms.
*
* ~~> means optional
*
* The dmlc text file parser returns number of blocks based on available threads, which
* can make the data partitioning non-deterministic, so here we set up an extra data pool
* to stage parsed data. As a result, the number of blocks returned by text parser does
* not equal to number of blocks in binary cache.
*
* Binary cache loading is async by the dmlc threaded iterator, which helps performance,
* but as this iterator itself is not thread safe, so calling
* `dmatrix->GetBatches<SparsePage>` is also not thread safe. Please note that, the
* threaded iterator is also used inside dmlc text file parser.
*
* Memory consumption is difficult to control due to various reasons. Firstly the text
* parsing doesn't have a batch size, only a hard coded buffer size is available.
* Secondly, everything is loaded/written with async queue, with multiple queues running
* the memory consumption is difficult to measure.
*
* The threaded iterator relies heavily on C++ memory model and threading primitive. The
* concurrent writer for binary cache is an old copy of moody queue. We should try to
* replace them with something more robust.
*/ */
#ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
#include <dmlc/threadediter.h> #include <algorithm> // std::min
#include <dmlc/timer.h>
#include <algorithm>
#include <limits>
#include <locale>
#include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <fstream> #include <future>
#include <thread>
#include <map>
#include <memory>
#include "rabit/rabit.h" #include "rabit/rabit.h"
#include "xgboost/base.h" #include "xgboost/base.h"
@ -56,93 +20,12 @@
#include "adapter.h" #include "adapter.h"
#include "sparse_page_writer.h" #include "sparse_page_writer.h"
#include "proxy_dmatrix.h"
#include "../common/common.h" #include "../common/common.h"
#include <xgboost/data.h>
namespace detail {
// Split a cache info string with delimiter ':'
// If cache info string contains drive letter (e.g. C:), exclude it before splitting
inline std::vector<std::string>
GetCacheShards(const std::string& cache_info) {
#if (defined _WIN32) || (defined __CYGWIN__)
if (cache_info.length() >= 2
&& std::isalpha(cache_info[0], std::locale::classic())
&& cache_info[1] == ':') {
std::vector<std::string> cache_shards
= xgboost::common::Split(cache_info.substr(2), ':');
cache_shards[0] = cache_info.substr(0, 2) + cache_shards[0];
return cache_shards;
}
#endif // (defined _WIN32) || (defined __CYGWIN__)
return xgboost::common::Split(cache_info, ':');
}
} // namespace detail
namespace xgboost { namespace xgboost {
namespace data { namespace data {
template<typename S, typename T>
class SparseBatchIteratorImpl : public BatchIteratorImpl<T> {
public:
explicit SparseBatchIteratorImpl(S* source) : source_(source) {
CHECK(source_ != nullptr);
source_->BeforeFirst();
source_->Next();
}
T& operator*() override { return source_->Value(); }
const T& operator*() const override { return source_->Value(); }
void operator++() override { at_end_ = !source_->Next(); }
bool AtEnd() const override { return at_end_; }
private:
S* source_{nullptr};
bool at_end_{ false };
};
/*! \brief magic number used to identify Page */
static const int kMagic = 0xffffab02;
/*!
* \brief decide the format from cache prefix.
* \return pair of row format, column format type of the cache prefix.
*/
inline std::pair<std::string, std::string> DecideFormat(const std::string& cache_prefix) {
size_t pos = cache_prefix.rfind(".fmt-");
if (pos != std::string::npos) {
std::string fmt = cache_prefix.substr(pos + 5, cache_prefix.length());
size_t cpos = fmt.rfind('-');
if (cpos != std::string::npos) {
return std::make_pair(fmt.substr(0, cpos), fmt.substr(cpos + 1, fmt.length()));
} else {
return std::make_pair(fmt, fmt);
}
} else {
std::string raw = "raw";
return std::make_pair(raw, raw);
}
}
struct CacheInfo {
std::string name_info;
std::vector<std::string> format_shards;
std::vector<std::string> name_shards;
};
inline CacheInfo ParseCacheInfo(const std::string& cache_info, const std::string& page_type) {
CacheInfo info;
std::vector<std::string> cache_shards = ::detail::GetCacheShards(cache_info);
CHECK_NE(cache_shards.size(), 0U);
// read in the info files.
info.name_info = cache_shards[0];
for (const std::string& prefix : cache_shards) {
info.name_shards.push_back(prefix + page_type);
info.format_shards.push_back(DecideFormat(prefix).first);
}
return info;
}
inline void TryDeleteCacheFile(const std::string& file) { inline void TryDeleteCacheFile(const std::string& file) {
if (std::remove(file.c_str()) != 0) { if (std::remove(file.c_str()) != 0) {
LOG(WARNING) << "Couldn't remove external memory cache file " << file LOG(WARNING) << "Couldn't remove external memory cache file " << file
@ -150,415 +33,327 @@ inline void TryDeleteCacheFile(const std::string& file) {
} }
} }
inline void CheckCacheFileExists(const std::string& file) { struct Cache {
std::ifstream f(file.c_str()); // whether the write to the cache is complete
if (f.good()) { bool written;
LOG(FATAL) std::string name;
<< "Cache file " << file << " exists already; "
<< "Is there another DMatrix with the same "
"cache prefix? It can be caused by previously used DMatrix that "
"hasn't been collected by language environment garbage collector. "
"Otherwise please remove it manually.";
}
}
/**
* \brief Given a set of cache files and page type, this object iterates over batches
* using prefetching for improved performance. Not thread safe.
*
* \tparam PageT Type of the page t.
*/
template <typename PageT>
class ExternalMemoryPrefetcher : dmlc::DataIter<PageT> {
public:
explicit ExternalMemoryPrefetcher(const CacheInfo& info) noexcept(false)
: base_rowid_(0), page_(nullptr), clock_ptr_(0) {
// read in the info files
CHECK_NE(info.name_shards.size(), 0U);
{
std::unique_ptr<dmlc::Stream> finfo(
dmlc::Stream::Create(info.name_info.c_str(), "r"));
int tmagic;
CHECK(finfo->Read(&tmagic));
CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
}
files_.resize(info.name_shards.size());
formats_.resize(info.name_shards.size());
prefetchers_.resize(info.name_shards.size());
// read in the cache files.
for (size_t i = 0; i < info.name_shards.size(); ++i) {
std::string name_row = info.name_shards.at(i);
files_[i].reset(dmlc::SeekStream::CreateForRead(name_row.c_str()));
std::unique_ptr<dmlc::SeekStream>& fi = files_[i];
std::string format; std::string format;
CHECK(fi->Read(&format)) << "Invalid page format"; // offset into binary cache file.
formats_[i].reset(CreatePageFormat<PageT>(format)); std::vector<size_t> offset;
std::unique_ptr<SparsePageFormat<PageT>>& fmt = formats_[i];
size_t fbegin = fi->Tell(); Cache(bool w, std::string n, std::string fmt)
prefetchers_[i].reset(new dmlc::ThreadedIter<PageT>(4)); : written{w}, name{std::move(n)}, format{std::move(fmt)} {
prefetchers_[i]->Init( offset.push_back(0);
[&fi, &fmt](PageT** dptr) {
if (*dptr == nullptr) {
*dptr = new PageT();
}
return fmt->Read(*dptr, fi.get());
},
[&fi, fbegin]() { fi->Seek(fbegin); });
}
}
/*! \brief destructor */
~ExternalMemoryPrefetcher() override {
delete page_;
} }
// implement Next static std::string ShardName(std::string name, std::string format) {
bool Next() override { CHECK_EQ(format.front(), '.');
CHECK(mutex_.try_lock()) << "Multiple threads attempting to use prefetcher"; return name + format;
// doing clock rotation over shards.
if (page_ != nullptr) {
size_t n = prefetchers_.size();
prefetchers_[(clock_ptr_ + n - 1) % n]->Recycle(&page_);
} }
if (prefetchers_[clock_ptr_]->Next(&page_)) { std::string ShardName() {
page_->SetBaseRowId(base_rowid_); return ShardName(this->name, this->format);
base_rowid_ += page_->Size(); }
// advance clock
clock_ptr_ = (clock_ptr_ + 1) % prefetchers_.size(); // The write is completed.
mutex_.unlock(); void Commit() {
return true; if (!written) {
} else { std::partial_sum(offset.begin(), offset.end(), offset.begin());
mutex_.unlock(); written = true;
}
}
};
// Prevents multi-threaded call.
class TryLockGuard {
std::mutex& lock_;
public:
explicit TryLockGuard(std::mutex& lock) : lock_{lock} { // NOLINT
CHECK(lock_.try_lock()) << "Multiple threads attempting to use Sparse DMatrix.";
}
~TryLockGuard() {
lock_.unlock();
}
};
template <typename S>
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
protected:
// Prevents calling this iterator from multiple places (or threads).
std::mutex single_threaded_;
std::shared_ptr<S> page_;
bool at_end_ {false};
float missing_;
int nthreads_;
bst_feature_t n_features_;
uint32_t count_{0};
uint32_t n_batches_ {0};
std::shared_ptr<Cache> cache_info_;
std::unique_ptr<dmlc::Stream> fo_;
using Ring = std::vector<std::future<std::shared_ptr<S>>>;
// A ring storing futures to data. Since the DMatrix iterator is forward-only,
// we can pre-fetch data in a ring (see the standalone sketch at the end of
// this file).
std::unique_ptr<Ring> ring_{new Ring};
bool ReadCache() {
CHECK(!at_end_);
if (!cache_info_->written) {
return false; return false;
} }
if (fo_) {
fo_.reset(); // flush the data to disk.
ring_->resize(n_batches_);
}
// A heuristic for the number of pre-fetched batches. We can make it part of
// BatchParam to let users adjust it when needed.
uint32_t constexpr kPreFetch = 4;
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
size_t fetch_it = count_;
for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
fetch_it %= n_batches_; // ring
if (ring_->at(fetch_it).valid()) { continue; }
auto const *self = this; // make sure it's const
CHECK_LT(fetch_it, cache_info_->offset.size());
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
auto n = self->cache_info_->ShardName();
size_t offset = self->cache_info_->offset.at(fetch_it);
std::unique_ptr<dmlc::SeekStream> fi{
dmlc::SeekStream::CreateForRead(n.c_str())};
fi->Seek(offset);
CHECK_EQ(fi->Tell(), offset);
auto page = std::make_shared<S>();
CHECK(fmt->Read(page.get(), fi.get()));
return page;
});
}
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(),
[](auto const &f) { return f.valid(); }),
n_prefetch_batches)
<< "Sparse DMatrix assumes forward iteration.";
page_ = (*ring_)[count_].get();
return true;
} }
// implement BeforeFirst void WriteCache() {
void BeforeFirst() override { CHECK(!cache_info_->written);
CHECK(mutex_.try_lock()) << "Multiple threads attempting to use prefetcher"; std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
base_rowid_ = 0; if (!fo_) {
clock_ptr_ = 0; auto n = cache_info_->ShardName();
for (auto& p : prefetchers_) { fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
p->BeforeFirst();
} }
mutex_.unlock(); auto bytes = fmt->Write(*page_, fo_.get());
cache_info_->offset.push_back(bytes);
} }
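// Illustration (not from the source) of the offset bookkeeping above:
// WriteCache() appends each page's byte count to `offset`, whose constructor
// seeded a leading 0; Commit() then runs std::partial_sum so that offset[i]
// becomes the absolute position where page i starts, which is exactly where
// ReadCache() seeks.
//
//   std::vector<size_t> offset{0};  // seeded by Cache's constructor
//   offset.push_back(100);          // page 0 wrote 100 bytes
//   offset.push_back(250);          // page 1 wrote 250 bytes
//   offset.push_back(80);           // page 2 wrote 80 bytes
//   std::partial_sum(offset.begin(), offset.end(), offset.begin());
//   // offset == {0, 100, 350, 430}; page i is read by seeking to offset[i].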
// implement Value virtual void Fetch() = 0;
PageT& Value() { return *page_; }
const PageT& Value() const override { return *page_; }
private:
std::mutex mutex_;
/*! \brief number of rows */
size_t base_rowid_;
/*! \brief page currently on hold. */
PageT* page_;
/*! \brief internal clock ptr */
size_t clock_ptr_;
/*! \brief file pointer to the row blob file. */
std::vector<std::unique_ptr<dmlc::SeekStream>> files_;
/*! \brief Sparse page format file. */
std::vector<std::unique_ptr<SparsePageFormat<PageT>>> formats_;
/*! \brief internal prefetcher. */
std::vector<std::unique_ptr<dmlc::ThreadedIter<PageT>>> prefetchers_;
};
// A data pool to keep the size of each page balanced and data partitioning to be
// deterministic.
class DataPool {
size_t inferred_num_rows_;
MetaInfo* info_;
SparsePage pool_;
size_t page_size_;
SparsePageWriter<SparsePage> *writer_;
void Slice(std::shared_ptr<SparsePage> out, size_t offset, size_t n_rows,
size_t entry_offset) const;
void SplitWritePage();
public: public:
DataPool(MetaInfo *info, size_t page_size, SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
SparsePageWriter<SparsePage> *writer) uint32_t n_batches, std::shared_ptr<Cache> cache)
: inferred_num_rows_{0}, info_{info}, : missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
page_size_{page_size}, writer_{writer} {} n_batches_{n_batches}, cache_info_{std::move(cache)} {}
void Push(std::shared_ptr<SparsePage> page) { SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
info_->num_nonzero_ += page->data.Size();
pool_.Push(*page); ~SparsePageSourceImpl() override {
if (pool_.Size() > page_size_) { for (auto& fu : *ring_) {
this->SplitWritePage(); if (fu.valid()) {
fu.get();
}
} }
page->Clear();
} }
size_t Finalize(); uint32_t Iter() const { return count_; }
const S &operator*() const override {
CHECK(page_);
return *page_;
}
std::shared_ptr<S const> Page() const override {
return page_;
}
bool AtEnd() const override {
return at_end_;
}
virtual void Reset() {
TryLockGuard guard{single_threaded_};
at_end_ = false;
count_ = 0;
this->Fetch();
}
}; };
class SparsePageSource { #if defined(XGBOOST_USE_CUDA)
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
#else
inline void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
common::AssertGPUSupport();
}
#endif
class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
DMatrixProxy* proxy_;
size_t base_row_id_ {0};
void Fetch() final {
page_ = std::make_shared<SparsePage>();
if (!this->ReadCache()) {
bool type_error { false };
CHECK(proxy_);
HostAdapterDispatch(proxy_, [&](auto const &adapter_batch) {
page_->Push(adapter_batch, this->missing_, this->nthreads_);
}, &type_error);
if (type_error) {
DevicePush(proxy_, missing_, page_.get());
}
page_->SetBaseRowId(base_row_id_);
base_row_id_ += page_->Size();
n_batches_++;
this->WriteCache();
}
}
public: public:
template <typename AdapterT> SparsePageSource(
SparsePageSource(AdapterT* adapter, float missing, int nthread, DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter,
const std::string& cache_info, DMatrixProxy *proxy, float missing, int nthreads,
const size_t page_size = DMatrix::kPageSize) { bst_feature_t n_features, uint32_t n_batches, std::shared_ptr<Cache> cache)
const std::string page_type = ".row.page"; : SparsePageSourceImpl(missing, nthreads, n_features, n_batches, cache),
cache_info_ = ParseCacheInfo(cache_info, page_type); iter_{iter}, proxy_{proxy} {
if (!cache_info_->written) {
// Warn user if old cache files iter_.Reset();
CheckCacheFileExists(cache_info_.name_info); iter_.Next();
for (auto file : cache_info_.name_shards) { }
CheckCacheFileExists(file); this->Fetch();
} }
{ SparsePageSource& operator++() final {
SparsePageWriter<SparsePage> writer(cache_info_.name_shards, TryLockGuard guard{single_threaded_};
cache_info_.format_shards, 6); count_++;
DataPool pool(&info, page_size, &writer); if (cache_info_->written) {
at_end_ = (count_ == n_batches_);
std::shared_ptr<SparsePage> page { new SparsePage };
uint64_t inferred_num_columns = 0;
uint64_t inferred_num_rows = 0;
const uint64_t default_max = std::numeric_limits<uint64_t>::max();
uint64_t last_group_id = default_max;
bst_uint group_size = 0;
std::vector<uint64_t> qids;
adapter->BeforeFirst();
while (adapter->Next()) {
auto& batch = adapter->Value();
if (batch.Labels() != nullptr) {
auto& labels = info.labels_.HostVector();
labels.insert(labels.end(), batch.Labels(),
batch.Labels() + batch.Size());
}
if (batch.Weights() != nullptr) {
auto& weights = info.weights_.HostVector();
weights.insert(weights.end(), batch.Weights(),
batch.Weights() + batch.Size());
}
if (batch.BaseMargin() != nullptr) {
auto& base_margin = info.base_margin_.HostVector();
base_margin.insert(base_margin.end(), batch.BaseMargin(),
batch.BaseMargin() + batch.Size());
}
if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
// get group
for (size_t i = 0; i < batch.Size(); ++i) {
const uint64_t cur_group_id = batch.Qid()[i];
if (last_group_id == default_max ||
last_group_id != cur_group_id) {
info.group_ptr_.push_back(group_size);
}
last_group_id = cur_group_id;
++group_size;
}
}
CHECK_EQ(page->Size(), 0);
auto batch_max_columns = page->Push(batch, missing, nthread);
inferred_num_columns =
std::max(batch_max_columns, inferred_num_columns);
inferred_num_rows += page->Size();
pool.Push(page);
page->SetBaseRowId(inferred_num_rows);
}
if (last_group_id != default_max) {
if (group_size > info.group_ptr_.back()) {
info.group_ptr_.push_back(group_size);
}
}
// Deal with empty rows/columns if necessary
if (adapter->NumColumns() == kAdapterUnknownSize) {
info.num_col_ = inferred_num_columns;
} else { } else {
info.num_col_ = adapter->NumColumns(); at_end_ = !iter_.Next();
} }
// Synchronise worker columns
rabit::Allreduce<rabit::op::Max>(&info.num_col_, 1);
if (adapter->NumRows() == kAdapterUnknownSize) { if (at_end_) {
info.num_row_ = inferred_num_rows; cache_info_->Commit();
if (n_batches_ != 0) {
CHECK_EQ(count_, n_batches_);
}
CHECK_GE(count_, 1);
proxy_ = nullptr;
} else { } else {
if (page->offset.HostVector().empty()) { this->Fetch();
page->offset.HostVector().emplace_back(0); }
return *this;
} }
while (inferred_num_rows < adapter->NumRows()) { void Reset() override {
page->offset.HostVector().emplace_back( if (proxy_) {
page->offset.HostVector().back()); TryLockGuard guard{single_threaded_};
inferred_num_rows++; iter_.Reset();
}
info.num_row_ = adapter->NumRows();
} }
SparsePageSourceImpl::Reset();
pool.Push(page); TryLockGuard guard{single_threaded_};
pool.Finalize(); base_row_id_ = 0;
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(cache_info_.name_info.c_str(), "w"));
int tmagic = kMagic;
fo->Write(tmagic);
// Either every row has query ID or none at all
CHECK(qids.empty() || qids.size() == info.num_row_);
info.SaveBinary(fo.get());
} }
LOG(INFO) << "SparsePageSource Finished writing to "
<< cache_info_.name_info;
external_prefetcher_.reset(
new ExternalMemoryPrefetcher<SparsePage>(cache_info_));
}
~SparsePageSource() {
external_prefetcher_.reset();
TryDeleteCacheFile(cache_info_.name_info);
for (auto file : cache_info_.name_shards) {
TryDeleteCacheFile(file);
}
}
BatchSet<SparsePage> GetBatchSet() {
auto begin_iter = BatchIterator<SparsePage>(
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<SparsePage>,
SparsePage>(external_prefetcher_.get()));
return BatchSet<SparsePage>(begin_iter);
}
MetaInfo info;
private:
std::unique_ptr<ExternalMemoryPrefetcher<SparsePage>> external_prefetcher_;
CacheInfo cache_info_;
}; };
class CSCPageSource { // A mixin that advances the underlying sparse page source in lock step.
template <typename S>
class PageSourceIncMixIn : public SparsePageSourceImpl<S> {
protected:
std::shared_ptr<SparsePageSource> source_;
public: public:
CSCPageSource(DMatrix* src, const std::string& cache_info, using SparsePageSourceImpl<S>::SparsePageSourceImpl;
const size_t page_size = DMatrix::kPageSize) { PageSourceIncMixIn& operator++() final {
std::string page_type = ".col.page"; TryLockGuard guard{this->single_threaded_};
cache_info_ = ParseCacheInfo(cache_info, page_type); ++(*source_);
for (auto file : cache_info_.name_shards) {
CheckCacheFileExists(file);
}
{
SparsePageWriter<SparsePage> writer(cache_info_.name_shards,
cache_info_.format_shards, 6);
std::shared_ptr<SparsePage> page;
writer.Alloc(&page);
page->Clear();
size_t bytes_write = 0; ++this->count_;
double tstart = dmlc::GetTime(); this->at_end_ = source_->AtEnd();
for (auto& batch : src->GetBatches<SparsePage>()) {
page->PushCSC(batch.GetTranspose(src->Info().num_col_));
if (page->MemCostBytes() >= page_size) { if (this->at_end_) {
bytes_write += page->MemCostBytes(); this->cache_info_->Commit();
writer.PushWrite(std::move(page)); if (this->n_batches_ != 0) {
writer.Alloc(&page); CHECK_EQ(this->count_, this->n_batches_);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
LOG(INFO) << "Writing to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
} }
CHECK_GE(this->count_, 1);
} else {
this->Fetch();
} }
if (page->data.Size() != 0) { CHECK_EQ(source_->Iter(), this->count_);
writer.PushWrite(std::move(page)); return *this;
} }
LOG(INFO) << "CSCPageSource: Finished writing to "
<< cache_info_.name_info;
}
external_prefetcher_.reset(
new ExternalMemoryPrefetcher<CSCPage>(cache_info_));
}
~CSCPageSource() {
external_prefetcher_.reset();
for (auto file : cache_info_.name_shards) {
TryDeleteCacheFile(file);
}
}
BatchSet<CSCPage> GetBatchSet() {
auto begin_iter = BatchIterator<CSCPage>(
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<CSCPage>, CSCPage>(
external_prefetcher_.get()));
return BatchSet<CSCPage>(begin_iter);
}
private:
std::unique_ptr<ExternalMemoryPrefetcher<CSCPage>> external_prefetcher_;
CacheInfo cache_info_;
}; };
class SortedCSCPageSource { class CSCPageSource : public PageSourceIncMixIn<CSCPage> {
protected:
void Fetch() final {
if (!this->ReadCache()) {
auto const &csr = source_->Page();
this->page_.reset(new CSCPage{});
// We might be able to optimize this by merging the transpose and PushCSC.
this->page_->PushCSC(csr->GetTranspose(n_features_));
page_->SetBaseRowId(csr->base_rowid);
this->WriteCache();
}
}
public: public:
SortedCSCPageSource(DMatrix* src, const std::string& cache_info, CSCPageSource(
const size_t page_size = DMatrix::kPageSize) { float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
std::string page_type = ".sorted.col.page"; std::shared_ptr<Cache> cache,
cache_info_ = ParseCacheInfo(cache_info, page_type); std::shared_ptr<SparsePageSource> source)
for (auto file : cache_info_.name_shards) { : PageSourceIncMixIn(missing, nthreads, n_features,
CheckCacheFileExists(file); n_batches, cache) {
this->source_ = source;
this->Fetch();
} }
{
SparsePageWriter<SparsePage> writer(cache_info_.name_shards,
cache_info_.format_shards, 6);
std::shared_ptr<SparsePage> page;
writer.Alloc(&page);
page->Clear();
size_t bytes_write = 0;
double tstart = dmlc::GetTime();
for (auto& batch : src->GetBatches<SparsePage>()) {
SparsePage tmp = batch.GetTranspose(src->Info().num_col_);
page->PushCSC(tmp);
page->SortRows();
if (page->MemCostBytes() >= page_size) {
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
LOG(INFO) << "Writing to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
}
}
if (page->data.Size() != 0) {
writer.PushWrite(std::move(page));
}
LOG(INFO) << "SortedCSCPageSource: Finished writing to "
<< cache_info_.name_info;
}
external_prefetcher_.reset(
new ExternalMemoryPrefetcher<SortedCSCPage>(cache_info_));
}
~SortedCSCPageSource() {
external_prefetcher_.reset();
for (auto file : cache_info_.name_shards) {
TryDeleteCacheFile(file);
}
}
BatchSet<SortedCSCPage> GetBatchSet() {
auto begin_iter = BatchIterator<SortedCSCPage>(
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<SortedCSCPage>,
SortedCSCPage>(external_prefetcher_.get()));
return BatchSet<SortedCSCPage>(begin_iter);
}
private:
std::unique_ptr<ExternalMemoryPrefetcher<SortedCSCPage>> external_prefetcher_;
CacheInfo cache_info_;
}; };
class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
protected:
void Fetch() final {
if (!this->ReadCache()) {
auto const &csr = this->source_->Page();
this->page_.reset(new SortedCSCPage{});
// We might be able to optimize this by merging the transpose and PushCSC.
this->page_->PushCSC(csr->GetTranspose(n_features_));
CHECK_EQ(this->page_->Size(), n_features_);
CHECK_EQ(this->page_->data.Size(), csr->data.Size());
this->page_->SortRows();
page_->SetBaseRowId(csr->base_rowid);
this->WriteCache();
}
}
public:
SortedCSCPageSource(float missing, int nthreads, bst_feature_t n_features,
uint32_t n_batches, std::shared_ptr<Cache> cache,
std::shared_ptr<SparsePageSource> source)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache) {
this->source_ = source;
this->Fetch();
}
};
} // namespace data } // namespace data
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
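A standalone sketch of the pre-fetch ring used by `ReadCache` above; the
`LoadBatch` loader and the plain int payload are stand-ins for reading a real
page from the binary cache:

#include <cstddef>
#include <future>
#include <memory>
#include <vector>

// Stand-in for deserializing one page from the cache.
std::shared_ptr<int> LoadBatch(std::size_t i) {
  return std::make_shared<int>(static_cast<int>(i));
}

int main() {
  std::size_t const n_batches = 8, n_prefetch = 4;
  // One future slot per batch; at most n_prefetch loads are in flight.
  std::vector<std::future<std::shared_ptr<int>>> ring(n_batches);
  for (std::size_t count = 0; count < n_batches; ++count) {
    // Schedule the next window of batches, wrapping around the ring.
    for (std::size_t i = 0, it = count; i < n_prefetch; ++i, ++it) {
      it %= n_batches;
      if (ring[it].valid()) { continue; }  // already scheduled
      ring[it] = std::async(std::launch::async, LoadBatch, it);
    }
    auto page = ring[count].get();  // blocks only if the batch isn't ready yet
    (void)page;
  }
  return 0;
}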

View File

@ -63,103 +63,6 @@ inline SparsePageFormat<T>* CreatePageFormat(const std::string& name) {
return (e->body)(); return (e->body)();
} }
#if DMLC_ENABLE_STD_THREAD
/*!
* \brief A threaded writer to write sparse batch page to sharded files.
* @tparam T Type of the page.
*/
template<typename T>
class SparsePageWriter {
public:
/*!
* \brief constructor
* \param name_shards name of shard files.
* \param format_shards format of each shard.
* \param extra_buffer_capacity Extra buffer capacity before block.
*/
explicit SparsePageWriter(const std::vector<std::string>& name_shards,
const std::vector<std::string>& format_shards,
size_t extra_buffer_capacity)
: num_free_buffer_(extra_buffer_capacity + name_shards.size()),
clock_ptr_(0),
workers_(name_shards.size()),
qworkers_(name_shards.size()) {
CHECK_EQ(name_shards.size(), format_shards.size());
// start writer threads
for (size_t i = 0; i < name_shards.size(); ++i) {
std::string name_shard = name_shards[i];
std::string format_shard = format_shards[i];
auto* wqueue = &qworkers_[i];
workers_[i].reset(new std::thread(
[this, name_shard, format_shard, wqueue]() {
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(name_shard.c_str(), "w"));
std::unique_ptr<SparsePageFormat<T>> fmt(CreatePageFormat<T>(format_shard));
fo->Write(format_shard);
std::shared_ptr<T> page;
while (wqueue->Pop(&page)) {
if (page == nullptr) break;
fmt->Write(*page, fo.get());
qrecycle_.Push(std::move(page));
}
fo.reset(nullptr);
LOG(INFO) << "SparsePageWriter Finished writing to " << name_shard;
}));
}
}
/*! \brief destructor, will close the files automatically */
~SparsePageWriter() {
for (auto& queue : qworkers_) {
// use nullptr to signal termination.
std::shared_ptr<T> sig(nullptr);
queue.Push(std::move(sig));
}
for (auto& thread : workers_) {
thread->join();
}
}
/*!
* \brief Push a write job to the writer.
* This function won't block,
* writing is done by another thread inside writer.
* \param page The page to be written
*/
void PushWrite(std::shared_ptr<T>&& page) {
qworkers_[clock_ptr_].Push(std::move(page));
clock_ptr_ = (clock_ptr_ + 1) % workers_.size();
}
/*!
* \brief Allocate a page to store results.
* This function can block when the writer is too slow and buffer pages
* have not yet been recycled.
* \param out_page Used to store the allocated pages.
*/
void Alloc(std::shared_ptr<T>* out_page) {
CHECK(*out_page == nullptr);
if (num_free_buffer_ != 0) {
out_page->reset(new T());
--num_free_buffer_;
} else {
CHECK(qrecycle_.Pop(out_page));
}
}
private:
/*! \brief number of allocated pages */
size_t num_free_buffer_;
/*! \brief clock_pointer */
size_t clock_ptr_;
/*! \brief writer threads */
std::vector<std::unique_ptr<std::thread>> workers_;
/*! \brief recycler queue */
dmlc::ConcurrentBlockingQueue<std::shared_ptr<T>> qrecycle_;
/*! \brief worker threads */
std::vector<dmlc::ConcurrentBlockingQueue<std::shared_ptr<T>>> qworkers_;
};
#endif // DMLC_ENABLE_STD_THREAD
/*! /*!
* \brief Registry entry for sparse page format. * \brief Registry entry for sparse page format.
*/ */

View File

@ -131,7 +131,7 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
if (rnd_(i) <= p) { if (rnd_(i) <= p) {
return gpair / p; return gpair / p;
} else { } else {
return GradientPair(); return {};
} }
} }
} }
@ -143,13 +143,13 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_; CombineGradientPair combine_;
}; };
NoSampling::NoSampling(EllpackPageImpl* page) : page_(page) {} NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) { GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
return {dmat->Info().num_row_, page_, gpair}; return {dmat->Info().num_row_, page_, gpair};
} }
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page, ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
size_t n_rows, size_t n_rows,
const BatchParam& batch_param) const BatchParam& batch_param)
: batch_param_(batch_param), : batch_param_(batch_param),
@ -171,7 +171,7 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
return {dmat->Info().num_row_, page_.get(), gpair}; return {dmat->Info().num_row_, page_.get(), gpair};
} }
UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample) UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
: page_(page), subsample_(subsample) {} : page_(page), subsample_(subsample) {}
GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) { GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
@ -183,7 +183,7 @@ GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DM
return {dmat->Info().num_row_, page_, gpair}; return {dmat->Info().num_row_, page_, gpair};
} }
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page, ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page,
size_t n_rows, size_t n_rows,
const BatchParam& batch_param, const BatchParam& batch_param,
float subsample) float subsample)
@ -231,7 +231,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
} }
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl* page, GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows, size_t n_rows,
const BatchParam&, const BatchParam&,
float subsample) float subsample)
@ -257,7 +257,7 @@ GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpa
} }
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling( ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
EllpackPageImpl* page, EllpackPageImpl const* page,
size_t n_rows, size_t n_rows,
const BatchParam& batch_param, const BatchParam& batch_param,
float subsample) float subsample)
@ -313,7 +313,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
} }
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page, GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
size_t n_rows, size_t n_rows,
const BatchParam& batch_param, const BatchParam& batch_param,
float subsample, float subsample,

View File

@ -16,7 +16,7 @@ struct GradientBasedSample {
/*!\brief Number of sampled rows. */ /*!\brief Number of sampled rows. */
size_t sample_rows; size_t sample_rows;
/*!\brief Sampled rows in ELLPACK format. */ /*!\brief Sampled rows in ELLPACK format. */
EllpackPageImpl* page; EllpackPageImpl const* page;
/*!\brief Gradient pairs for the sampled rows. */ /*!\brief Gradient pairs for the sampled rows. */
common::Span<GradientPair> gpair; common::Span<GradientPair> gpair;
}; };
@ -31,17 +31,17 @@ class SamplingStrategy {
/*! \brief No sampling in in-memory mode. */ /*! \brief No sampling in in-memory mode. */
class NoSampling : public SamplingStrategy { class NoSampling : public SamplingStrategy {
public: public:
explicit NoSampling(EllpackPageImpl* page); explicit NoSampling(EllpackPageImpl const* page);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override; GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private: private:
EllpackPageImpl* page_; EllpackPageImpl const* page_;
}; };
/*! \brief No sampling in external memory mode. */ /*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy { class ExternalMemoryNoSampling : public SamplingStrategy {
public: public:
ExternalMemoryNoSampling(EllpackPageImpl* page, ExternalMemoryNoSampling(EllpackPageImpl const* page,
size_t n_rows, size_t n_rows,
const BatchParam& batch_param); const BatchParam& batch_param);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
@@ -55,25 +55,25 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
 /*! \brief Uniform sampling in in-memory mode. */
 class UniformSampling : public SamplingStrategy {
  public:
-  UniformSampling(EllpackPageImpl* page, float subsample);
+  UniformSampling(EllpackPageImpl const* page, float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* page_;
+  EllpackPageImpl const* page_;
   float subsample_;
 };

 /*! \brief Uniform sampling in external memory mode. */
 class ExternalMemoryUniformSampling : public SamplingStrategy {
  public:
-  ExternalMemoryUniformSampling(EllpackPageImpl* page,
+  ExternalMemoryUniformSampling(EllpackPageImpl const* page,
                                 size_t n_rows,
                                 const BatchParam& batch_param,
                                 float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* original_page_;
+  EllpackPageImpl const* original_page_;
   BatchParam batch_param_;
   float subsample_;
   std::unique_ptr<EllpackPageImpl> page_;
@@ -84,14 +84,14 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode. */
 class GradientBasedSampling : public SamplingStrategy {
  public:
-  GradientBasedSampling(EllpackPageImpl* page,
+  GradientBasedSampling(EllpackPageImpl const* page,
                         size_t n_rows,
                         const BatchParam& batch_param,
                         float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* page_;
+  EllpackPageImpl const* page_;
   float subsample_;
   dh::caching_device_vector<float> threshold_;
   dh::caching_device_vector<float> grad_sum_;
@@ -100,14 +100,14 @@ class GradientBasedSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in external memory mode. */
 class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  public:
-  ExternalMemoryGradientBasedSampling(EllpackPageImpl* page,
+  ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page,
                                       size_t n_rows,
                                       const BatchParam& batch_param,
                                       float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* original_page_;
+  EllpackPageImpl const* original_page_;
   BatchParam batch_param_;
   float subsample_;
   dh::caching_device_vector<float> threshold_;
@@ -128,7 +128,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  */
 class GradientBasedSampler {
  public:
-  GradientBasedSampler(EllpackPageImpl* page,
+  GradientBasedSampler(EllpackPageImpl const* page,
                        size_t n_rows,
                        const BatchParam& batch_param,
                        float subsample,
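All sampling strategies now take `EllpackPageImpl const*`: pages handed out by the rewritten external-memory DMatrix are cached, shared objects, so a sampler may read but must not mutate them. A minimal sketch of the call pattern (hypothetical `sampler`, `gpair` and `dmat`; the full GradientBasedSampler constructor takes more parameters than this hunk shows):

    // Sketch only; Sample() matches the signature declared above.
    common::Span<GradientPair> gpair = /* gradients for this iteration */;
    DMatrix* dmat = /* training data */;
    GradientBasedSample sample = sampler.Sample(gpair, dmat);
    // `sample` refers to a read-only, possibly subsampled ELLPACK page.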
View File
@@ -163,7 +163,7 @@ class DeviceHistogram {
 template <typename GradientSumT>
 struct GPUHistMakerDevice {
   int device_id;
-  EllpackPageImpl* page;
+  EllpackPageImpl const* page;
   common::Span<FeatureType const> feature_types;
   BatchParam batch_param;
@@ -199,7 +199,7 @@ struct GPUHistMakerDevice {
   dh::caching_device_vector<uint32_t> node_categories;

   GPUHistMakerDevice(int _device_id,
-                     EllpackPageImpl* _page,
+                     EllpackPageImpl const* _page,
                      common::Span<FeatureType const> _feature_types,
                      bst_uint _n_rows,
                      TrainParam _param,
@@ -488,7 +488,7 @@ struct GPUHistMakerDevice {
     }
   }

-  void FinalisePositionInPage(EllpackPageImpl *page,
+  void FinalisePositionInPage(EllpackPageImpl const *page,
                               const common::Span<RegTree::Node> d_nodes,
                               common::Span<FeatureType const> d_feature_types,
                               common::Span<uint32_t const> categories,
@@ -812,7 +812,6 @@ class GPUHistMakerSpecialised {
     BatchParam batch_param{
       device_,
       param_.max_bin,
-      generic_param_->gpu_page_size
     };
     auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
     dh::safe_cuda(cudaSetDevice(device_));
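With `gpu_page_size` no longer part of BatchParam, an ELLPACK batch is requested with just a device ordinal and a bin count; the page size is managed internally so the number of in-memory pages stays bounded. A sketch of the new request path (assuming `device`, `max_bin` and `dmat` are in scope):

    BatchParam param{device, max_bin};  // page size is no longer part of the key
    for (auto const& page : dmat->GetBatches<EllpackPage>(param)) {
      auto const* impl = page.Impl();   // EllpackPageImpl const*: read-only
      // process one bounded-size page at a time
    }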
View File
@@ -125,12 +125,10 @@ TEST(DenseColumnWithMissing, Test) {
 }

 void TestGHistIndexMatrixCreation(size_t nthreads) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
   /* This should create multiple sparse pages */
-  std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries, kPageSize, filename) };
+  std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries) };
   omp_set_num_threads(nthreads);
   GHistIndexMatrix gmat(dmat.get(), 256);
 }
View File
@@ -83,7 +83,7 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
   }
   fo.close();
   return std::shared_ptr<DMatrix>(DMatrix::Load(
-      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
+      tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
 }

 // Test that elements are approximately equally distributed among bins
View File
@@ -59,12 +59,9 @@ TEST(SparsePage, PushCSC) {
 }

 TEST(SparsePage, PushCSCAfterTranspose) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<DMatrix> dmat =
-      CreateSparsePageDMatrix(kEntries, 64UL, filename);
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
   const int ncols = dmat->Info().num_col_;
   SparsePage page; // Consolidated sparse page
   for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
@@ -76,12 +73,12 @@ TEST(SparsePage, PushCSCAfterTranspose) {
   // Make sure that the final sparse page has the right number of entries
   ASSERT_EQ(kEntries, page.data.Size());

-  // The feature value for a feature in each row should be identical, as that is
-  // how the dmatrix has been created
-  for (size_t i = 0; i < page.Size(); ++i) {
-    auto inst = page.GetView()[i];
-    for (size_t j = 1; j < inst.size(); ++j) {
-      ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
+  page.SortRows();
+  auto v = page.GetView();
+  for (size_t i = 0; i < v.Size(); ++i) {
+    auto column = v[i];
+    for (size_t j = 1; j < column.size(); ++j) {
+      ASSERT_GE(column[j].fvalue, column[j-1].fvalue);
     }
   }
 }
View File
@@ -142,7 +142,7 @@ TEST(EllpackPage, Copy) {
   dmlc::TemporaryDirectory tmpdir;
   std::unique_ptr<DMatrix>
       dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 256, kPageSize};
+  BatchParam param{0, 256};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();

   // Create an empty result page.
@@ -188,7 +188,7 @@ TEST(EllpackPage, Compact) {
   dmlc::TemporaryDirectory tmpdir;
   std::unique_ptr<DMatrix>
       dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 256, kPageSize};
+  BatchParam param{0, 256};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();

   // Create an empty result page.
@@ -212,7 +212,7 @@ TEST(EllpackPage, Compact) {
   std::vector<bst_float> row_result(kCols);
   for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
     auto impl = page.Impl();
-    EXPECT_EQ(impl->base_rowid, current_row);
+    ASSERT_EQ(impl->base_rowid, current_row);
     for (size_t i = 0; i < impl->Size(); i++) {
       size_t compacted_row = row_indexes_h[current_row];
View File
@@ -0,0 +1,46 @@
+/*!
+ * Copyright 2021 XGBoost contributors
+ */
+#include <gtest/gtest.h>
+#include <dmlc/filesystem.h>
+
+#include <memory>
+
+#include "../../../src/data/file_iterator.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../../../src/data/adapter.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace data {
+TEST(FileIterator, Basic) {
+  auto check_n_features = [](FileIterator *iter) {
+    size_t n_features = 0;
+    iter->Reset();
+    while (iter->Next()) {
+      auto proxy = MakeProxy(iter->Proxy());
+      auto csr = dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
+      n_features = std::max(n_features, csr->NumColumns());
+    }
+    ASSERT_EQ(n_features, 5);
+  };
+
+  dmlc::TemporaryDirectory tmpdir;
+  {
+    auto zpath = tmpdir.path + "/0-based.svm";
+    CreateBigTestData(zpath, 3 * 64, true);
+    zpath += "?indexing_mode=0";
+    FileIterator iter{zpath, 0, 1, "libsvm"};
+    check_n_features(&iter);
+  }
+  {
+    auto opath = tmpdir.path + "/1-based.svm";
+    CreateBigTestData(opath, 3 * 64, false);
+    opath += "?indexing_mode=1";
+    FileIterator iter{opath, 0, 1, "libsvm"};
+    check_n_features(&iter);
+  }
+}
+} // namespace data
+} // namespace xgboost
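FileIterator is the internal adapter that routes file loading through the same callback interface as user-supplied iterators; the `?indexing_mode=` URI parameter selects between 0- and 1-based libsvm indices. The wiring used by the DMatrix tests later in this commit looks like this (assuming `path` points at a libsvm file):

    data::FileIterator iter{path, 0, 1, "libsvm"};
    data::SparsePageDMatrix m{&iter,
                              iter.Proxy(),
                              data::fileiter::Reset,
                              data::fileiter::Next,
                              std::numeric_limits<float>::quiet_NaN(),
                              1,
                              "cache"};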
View File
@@ -142,7 +142,7 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
   IterativeDeviceDMatrix m(
       &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
       0, 256);
-  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256, 0}).begin();
+  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
   auto impl = ellpack.Impl();
   common::CompressedIterator<uint32_t> iterator(
       impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
View File
@@ -260,7 +260,7 @@ TEST(MetaInfo, HostExtend) {
   lhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
   rhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
-  lhs.Extend(rhs, true);
+  lhs.Extend(rhs, true, true);
   ASSERT_EQ(lhs.num_row_, kRows * 2);
   ASSERT_TRUE(lhs.labels_.HostCanRead());
   ASSERT_TRUE(rhs.labels_.HostCanRead());
View File
@@ -141,7 +141,7 @@ TEST(MetaInfo, DeviceExtend) {
   lhs.num_row_ = kRows;
   rhs.num_row_ = kRows;
-  lhs.Extend(rhs, true);
+  lhs.Extend(rhs, true, true);
   ASSERT_EQ(lhs.num_row_, kRows * 2);
   ASSERT_FALSE(lhs.labels_.HostCanRead());
View File
@@ -6,11 +6,100 @@
 #include <future>
 #include "../../../src/common/io.h"
 #include "../../../src/data/adapter.h"
+#include "../../../src/data/simple_dmatrix.h"
 #include "../../../src/data/sparse_page_dmatrix.h"
+#include "../../../src/data/file_iterator.h"
 #include "../helpers.h"

 using namespace xgboost; // NOLINT

+template <typename Page>
+void TestSparseDMatrixLoadFile() {
+  dmlc::TemporaryDirectory tmpdir;
+  auto opath = tmpdir.path + "/1-based.svm";
+  CreateBigTestData(opath, 3 * 64, false);
+  opath += "?indexing_mode=1";
+  data::FileIterator iter{opath, 0, 1, "libsvm"};
+  data::SparsePageDMatrix m{&iter,
+                            iter.Proxy(),
+                            data::fileiter::Reset,
+                            data::fileiter::Next,
+                            std::numeric_limits<float>::quiet_NaN(),
+                            1,
+                            "cache"};
+  ASSERT_EQ(m.Info().num_col_, 5);
+  ASSERT_EQ(m.Info().num_row_, 64);
+
+  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
+      dmlc::Parser<uint32_t>::Create(opath.c_str(), 0, 1, "auto"));
+  auto adapter = data::FileAdapter{parser.get()};
+  data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
+                             1};
+
+  Page out;
+  for (auto const& page : m.GetBatches<Page>()) {
+    if (std::is_same<Page, SparsePage>::value) {
+      out.Push(page);
+    } else {
+      out.PushCSC(page);
+    }
+  }
+  ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
+  ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
+
+  for (auto const& page : simple.GetBatches<Page>()) {
+    ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
+    for (size_t i = 0; i < page.data.Size(); ++i) {
+      ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
+    }
+  }
+}
+
+TEST(SparsePageDMatrix, LoadFile) {
+  TestSparseDMatrixLoadFile<SparsePage>();
+  TestSparseDMatrixLoadFile<CSCPage>();
+  TestSparseDMatrixLoadFile<SortedCSCPage>();
+}
+// Allow the caller to retain pages so that multiple pages can be processed
+// at the same time.
+template <typename Page>
+void TestRetainPage() {
+  auto m = CreateSparsePageDMatrix(10000);
+  auto batches = m->GetBatches<Page>();
+  auto begin = batches.begin();
+  auto end = batches.end();
+
+  std::vector<Page> pages;
+  std::vector<std::shared_ptr<Page const>> iterators;
+  for (auto it = begin; it != end; ++it) {
+    iterators.push_back(it.Page());
+    pages.emplace_back(Page{});
+    if (std::is_same<Page, SparsePage>::value) {
+      pages.back().Push(*it);
+    } else {
+      pages.back().PushCSC(*it);
+    }
+    ASSERT_EQ(pages.back().Size(), (*it).Size());
+  }
+  ASSERT_GE(iterators.size(), 2);
+
+  for (size_t i = 0; i < iterators.size(); ++i) {
+    ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size());
+    ASSERT_EQ((*iterators[i]).data.HostVector(), pages.at(i).data.HostVector());
+  }
+
+  // Make sure the page is const and the caller cannot modify its content.
+  for (auto& page : m->GetBatches<Page>()) {
+    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
+  }
+}
+
+TEST(SparsePageDMatrix, RetainSparsePage) {
+  TestRetainPage<SparsePage>();
+  TestRetainPage<CSCPage>();
+  TestRetainPage<SortedCSCPage>();
+}
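The batch iterators now expose the current page as std::shared_ptr<Page const> via `it.Page()`, so a caller can keep several pages alive at once while the source still bounds how many pages it keeps in memory. Condensed from the test above (assuming `m` is a DMatrix backed by multiple pages):

    auto batches = m->GetBatches<SparsePage>();
    std::vector<std::shared_ptr<SparsePage const>> live;
    for (auto it = batches.begin(); it != batches.end(); ++it) {
      live.push_back(it.Page());  // extends this page's lifetime past the loop
    }
    // The pages are const: retaining them never lets the caller corrupt the cache.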
 TEST(SparsePageDMatrix, MetaInfo) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
@@ -19,8 +108,6 @@ TEST(SparsePageDMatrix, MetaInfo) {
   xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
       tmp_file + "#" + tmp_file + ".cache", false, false);
-  std::cout << tmp_file << std::endl;
-  EXPECT_TRUE(FileExists(tmp_file + ".cache"));

   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 8ul);
@@ -32,10 +119,7 @@
 }

 TEST(SparsePageDMatrix, RowAccess) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
-  std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(24, 4, filename);
+  std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(24);

   // Test the data read into the first row
   auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@@ -43,7 +127,7 @@ TEST(SparsePageDMatrix, RowAccess) {
   auto first_row = page[0];
   ASSERT_EQ(first_row.size(), 3ul);
   EXPECT_EQ(first_row[2].index, 2u);
-  EXPECT_EQ(first_row[2].fvalue, 20);
+  EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4);
 }

 TEST(SparsePageDMatrix, ColAccess) {
@@ -54,55 +138,46 @@
       xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);

   // Loop over the batches and assert the data is as expected
+  size_t iter = 0;
   for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
     auto col_page = col_batch.GetView();
-    EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
-    EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
-    EXPECT_EQ(col_page[1].size(), 1);
+    ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
+    if (iter == 1) {
+      ASSERT_EQ(col_page[0][0].fvalue, 0.f);
+      ASSERT_EQ(col_page[3][0].fvalue, 30.f);
+      ASSERT_EQ(col_page[3][0].index, 1);
+      ASSERT_EQ(col_page[3].size(), 1);
+    } else {
+      ASSERT_EQ(col_page[1][0].fvalue, 10.0f);
+      ASSERT_EQ(col_page[1].size(), 1);
+    }
+    CHECK_LE(col_batch.base_rowid, dmat->Info().num_row_);
+    ++iter;
   }

   // Loop over the batches and assert the data is as expected
+  iter = 0;
   for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
     auto col_page = col_batch.GetView();
     EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
+    if (iter == 0) {
       EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
       EXPECT_EQ(col_page[1].size(), 1);
+    } else {
+      EXPECT_EQ(col_page[3][0].fvalue, 30.f);
+      EXPECT_EQ(col_page[3].size(), 1);
+    }
+    iter++;
   }

-  EXPECT_TRUE(FileExists(tmp_file + ".cache"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
   delete dmat;
-  EXPECT_FALSE(FileExists(tmp_file + ".cache"));
-  EXPECT_FALSE(FileExists(tmp_file + ".cache.row.page"));
-  EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
-  EXPECT_FALSE(FileExists(tmp_file + ".cache.sorted.col.page"));
-}
-
-TEST(SparsePageDMatrix, ExistingCacheFile) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
-  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
-  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
-  EXPECT_ANY_THROW({
-    std::unique_ptr<xgboost::DMatrix> dmat2 =
-        xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
-  });
 }

 TEST(SparsePageDMatrix, ThreadSafetyException) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/test";
-  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
-  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
+  size_t constexpr kEntriesPerCol = 3;
+  size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
   std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
+      xgboost::CreateSparsePageDMatrix(kEntries);
   int threads = 1000;
@@ -134,13 +209,10 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
 // Multi-batches access
 TEST(SparsePageDMatrix, ColAccessBatches) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
   // Create multiple sparse pages
-  std::unique_ptr<xgboost::DMatrix> dmat{
-      xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename)};
+  std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
   auto n_threads = omp_get_max_threads();
   omp_set_num_threads(16);
   for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
@@ -149,234 +221,37 @@
   omp_set_num_threads(n_threads);
 }
-TEST(SparsePageDMatrix, Empty) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data{};
-  std::vector<unsigned> feature_idx = {};
-  std::vector<size_t> row_ptr = {};
-
-  {
-    data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(),
-                                 data.data(), 0, 0, 0);
-    data::SparsePageDMatrix dmat(
-        &csr_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-    EXPECT_EQ(dmat.Info().num_nonzero_, 0);
-    EXPECT_EQ(dmat.Info().num_row_, 0);
-    EXPECT_EQ(dmat.Info().num_col_, 0);
-    for (auto &batch : dmat.GetBatches<SparsePage>()) {
-      EXPECT_EQ(batch.Size(), 0);
-    }
-  }
-
-  {
-    data::DenseAdapter dense_adapter(nullptr, 0, 0);
-    data::SparsePageDMatrix dmat2(
-        &dense_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-    EXPECT_EQ(dmat2.Info().num_nonzero_, 0);
-    EXPECT_EQ(dmat2.Info().num_row_, 0);
-    EXPECT_EQ(dmat2.Info().num_col_, 0);
-    for (auto &batch : dmat2.GetBatches<SparsePage>()) {
-      EXPECT_EQ(batch.Size(), 0);
-    }
-  }
-
-  {
-    data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
-    data::SparsePageDMatrix dmat3(
-        &csc_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-    EXPECT_EQ(dmat3.Info().num_nonzero_, 0);
-    EXPECT_EQ(dmat3.Info().num_row_, 0);
-    EXPECT_EQ(dmat3.Info().num_col_, 0);
-    for (auto &batch : dmat3.GetBatches<SparsePage>()) {
-      EXPECT_EQ(batch.Size(), 0);
-    }
-  }
-}
-
-TEST(SparsePageDMatrix, MissingData) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data{0.0, std::nanf(""), 1.0};
-  std::vector<unsigned> feature_idx = {0, 1, 0};
-  std::vector<size_t> row_ptr = {0, 2, 3};
-
-  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
-                           3, 2);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 2);
-
-  const std::string tmp_file2 = tempdir.path + "/simple2.libsvm";
-  data::SparsePageDMatrix dmat2(&adapter, 1.0, 1, tmp_file2);
-  EXPECT_EQ(dmat2.Info().num_nonzero_, 1);
-}
-
-TEST(SparsePageDMatrix, EmptyRow) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data{0.0, 1.0};
-  std::vector<unsigned> feature_idx = {0, 1};
-  std::vector<size_t> row_ptr = {0, 2, 2};
-
-  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
-                           2, 2);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 2);
-  EXPECT_EQ(dmat.Info().num_row_, 2);
-  EXPECT_EQ(dmat.Info().num_col_, 2);
-}
-
-TEST(SparsePageDMatrix, FromDense) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  int m = 3;
-  int n = 2;
-  std::vector<float> data = {1, 2, 3, 4, 5, 6};
-  data::DenseAdapter adapter(data.data(), m, n);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_col_, 2);
-  EXPECT_EQ(dmat.Info().num_row_, 3);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 6);
-
-  for (auto &batch : dmat.GetBatches<SparsePage>()) {
-    auto page = batch.GetView();
-    for (auto i = 0ull; i < batch.Size(); i++) {
-      auto inst = page[i];
-      for (auto j = 0ull; j < inst.size(); j++) {
-        EXPECT_EQ(inst[j].fvalue, data[i * n + j]);
-        EXPECT_EQ(inst[j].index, j);
-      }
-    }
-  }
-}
-
-TEST(SparsePageDMatrix, FromCSC) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data = {1, 3, 2, 4, 5};
-  std::vector<unsigned> row_idx = {0, 1, 0, 1, 2};
-  std::vector<size_t> col_ptr = {0, 2, 5};
-  data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_col_, 2);
-  EXPECT_EQ(dmat.Info().num_row_, 3);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 5);
-
-  auto &batch = *dmat.GetBatches<SparsePage>().begin();
-  auto page = batch.GetView();
-  auto inst = page[0];
-  EXPECT_EQ(inst[0].fvalue, 1);
-  EXPECT_EQ(inst[0].index, 0);
-  EXPECT_EQ(inst[1].fvalue, 2);
-  EXPECT_EQ(inst[1].index, 1);
-
-  inst = page[1];
-  EXPECT_EQ(inst[0].fvalue, 3);
-  EXPECT_EQ(inst[0].index, 0);
-  EXPECT_EQ(inst[1].fvalue, 4);
-  EXPECT_EQ(inst[1].index, 1);
-
-  inst = page[2];
-  EXPECT_EQ(inst[0].fvalue, 5);
-  EXPECT_EQ(inst[0].index, 1);
-}
-
-TEST(SparsePageDMatrix, FromFile) {
-  std::string filename = "test.libsvm";
-  CreateBigTestData(filename, 20);
-  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
-  data::FileAdapter adapter(parser.get());
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1);
-  ASSERT_EQ(dmat.Info().num_col_, 5);
-
-  for (auto &batch : dmat.GetBatches<SparsePage>()) {
-    std::vector<bst_row_t> expected_offset(batch.Size() + 1);
-    auto page = batch.GetView();
-    int n = -3;
-    std::generate(expected_offset.begin(), expected_offset.end(),
-                  [&n] { return n += 3; });
-    EXPECT_EQ(batch.offset.HostVector(), expected_offset);
-
-    if (batch.base_rowid % 2 == 0) {
-      EXPECT_EQ(page[0][0].index, 0);
-      EXPECT_EQ(page[0][1].index, 1);
-      EXPECT_EQ(page[0][2].index, 2);
-    } else {
-      EXPECT_EQ(page[0][0].index, 0);
-      EXPECT_EQ(page[0][1].index, 3);
-      EXPECT_EQ(page[0][2].index, 4);
-    }
-  }
-}
-
-TEST(SparsePageDMatrix, Large) {
-  std::string filename = "test.libsvm";
-  CreateBigTestData(filename, 1 << 16);
-  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
-  data::FileAdapter adapter(parser.get());
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 16)};
-  std::unique_ptr<DMatrix> simple{DMatrix::Load(filename, true, true)};
-
-  std::vector<float> sparse_data;
-  std::vector<size_t> sparse_rptr;
-  std::vector<bst_feature_t> sparse_cids;
-  DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
-
-  std::vector<float> simple_data;
-  std::vector<size_t> simple_rptr;
-  std::vector<bst_feature_t> simple_cids;
-  DMatrixToCSR(simple.get(), &simple_data, &simple_rptr, &simple_cids);
-
-  ASSERT_EQ(sparse_rptr.size(), sparse->Info().num_row_ + 1);
-  ASSERT_EQ(sparse_rptr.size(), simple->Info().num_row_ + 1);
-  ASSERT_EQ(sparse_data.size(), simple_data.size());
-  ASSERT_EQ(sparse_data, simple_data);
-  ASSERT_EQ(sparse_rptr.size(), simple_rptr.size());
-  ASSERT_EQ(sparse_rptr, simple_rptr);
-  ASSERT_EQ(sparse_cids, simple_cids);
-}
-
-auto TestSparsePageDMatrixDeterminism(int32_t threads, std::string const& filename) {
+auto TestSparsePageDMatrixDeterminism(int32_t threads) {
   omp_set_num_threads(threads);
   std::vector<float> sparse_data;
   std::vector<size_t> sparse_rptr;
   std::vector<bst_feature_t> sparse_cids;
-  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
-  data::FileAdapter adapter(parser.get());
   dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1 << 8)};
+  std::string filename = tempdir.path + "/simple.libsvm";
+  CreateBigTestData(filename, 1 << 16);
+
+  data::FileIterator iter(filename, 0, 1, "auto");
+  std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix{
+      &iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
+      std::numeric_limits<float>::quiet_NaN(), 1, filename}};
   DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);

-  std::string cache_name = tmp_file + ".row.page";
+  auto cache_name =
+      data::MakeId(filename,
+                   dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) +
+      ".row.page";
   std::string cache = common::LoadSequentialFile(cache_name);
   return cache;
 }
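Cache files are no longer named directly after the input file: data::MakeId derives an id from the cache prefix and the concrete DMatrix instance, and the page-type suffix is appended to that id. Resolving the row page as in the helper above:

    auto id = data::MakeId(filename, dynamic_cast<data::SparsePageDMatrix *>(sparse.get()));
    auto row_page = id + ".row.page";  // ellpack and column pages follow the same scheme

Because the id is instance-specific, two DMatrix objects no longer compete for one cache path, which is presumably why the old ExistingCacheFile test could be dropped.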
 TEST(SparsePageDMatrix, Determinism) {
-  std::string filename = "test.libsvm";
-  CreateBigTestData(filename, 1 << 16);
+#if defined(_MSC_VER)
+  return;
+#endif  // defined(_MSC_VER)
   std::vector<std::string> caches;
   for (size_t i = 1; i < 18; i += 2) {
-    caches.emplace_back(TestSparsePageDMatrixDeterminism(i, filename));
+    caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
   }

   for (size_t i = 1; i < caches.size(); ++i) {
View File
@@ -4,6 +4,7 @@
 #include "../helpers.h"
 #include "../../../src/common/compressed_iterator.h"
 #include "../../../src/data/ellpack_page.cuh"
+#include "../../../src/data/sparse_page_dmatrix.h"

 namespace xgboost {
@@ -14,13 +15,22 @@ TEST(SparsePageDMatrix, EllpackPage) {
   DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);

   // Loop over the batches and assert the data is as expected
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 64})) {
-    EXPECT_EQ(batch.Size(), dmat->Info().num_row_);
+  size_t n = 0;
+  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
+    n += batch.Size();
   }
+  EXPECT_EQ(n, dmat->Info().num_row_);

-  EXPECT_TRUE(FileExists(tmp_file + ".cache"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.ellpack.page"));
+  auto path =
+      data::MakeId(tmp_file + ".cache",
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
+      ".row.page";
+  EXPECT_TRUE(FileExists(path));
+  path =
+      data::MakeId(tmp_file + ".cache",
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
+      ".ellpack.page";
+  EXPECT_TRUE(FileExists(path));
   delete dmat;
 }
@@ -30,12 +40,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
   std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, filename);

   // Loop over the batches and count the records
   int64_t batch_count = 0;
   int64_t row_count = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 7UL})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
     EXPECT_LT(batch.Size(), dmat->Info().num_row_);
     batch_count++;
     row_count += batch.Size();
@@ -43,7 +53,36 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
   EXPECT_GE(batch_count, 2);
   EXPECT_EQ(row_count, dmat->Info().num_row_);

-  EXPECT_TRUE(FileExists(filename + ".cache.ellpack.page"));
+  auto path =
+      data::MakeId(filename,
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
+      ".ellpack.page";
+}
+
+TEST(SparsePageDMatrix, RetainEllpackPage) {
+  auto m = CreateSparsePageDMatrix(10000);
+  auto batches = m->GetBatches<EllpackPage>({0, 32});
+  auto begin = batches.begin();
+  auto end = batches.end();
+
+  std::vector<HostDeviceVector<common::CompressedByteT>> gidx_buffers;
+  std::vector<std::shared_ptr<EllpackPage const>> iterators;
+  for (auto it = begin; it != end; ++it) {
+    iterators.push_back(it.Page());
+
+    gidx_buffers.emplace_back(HostDeviceVector<common::CompressedByteT>{});
+    gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size());
+    gidx_buffers.back().Copy((*it).Impl()->gidx_buffer);
+  }
+  ASSERT_GE(iterators.size(), 2);
+
+  for (size_t i = 0; i < iterators.size(); ++i) {
+    ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
+  }
+
+  // Make sure the page is const and the caller cannot modify its content.
+  for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
+    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
+  }
 }

 TEST(SparsePageDMatrix, EllpackPageContent) {
@@ -59,7 +98,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 2, 0};
+  BatchParam param{0, 2};
   auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
@@ -67,7 +106,17 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   EXPECT_EQ(impl->row_stride, 2);
   EXPECT_EQ(impl->Cuts().TotalBins(), 4);

-  auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
+  std::unique_ptr<EllpackPageImpl> impl_ext;
+  size_t offset = 0;
+  for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
+    if (!impl_ext) {
+      impl_ext.reset(new EllpackPageImpl(
+          batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
+          batch.Impl()->is_dense, batch.Impl()->row_stride, kRows));
+    }
+    auto n_elems = impl_ext->Copy(0, batch.Impl(), offset);
+    offset += n_elems;
+  }
   EXPECT_EQ(impl_ext->base_rowid, 0);
   EXPECT_EQ(impl_ext->n_rows, kRows);
   EXPECT_FALSE(impl_ext->is_dense);
@@ -109,7 +158,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, kMaxBins, kPageSize};
+  BatchParam param{0, kMaxBins};
   auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
@@ -150,7 +199,7 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, kMaxBins, kPageSize};
+  BatchParam param{0, kMaxBins};
   size_t current_row = 0;
   for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
View File
@@ -155,7 +155,8 @@ TEST(GBTree, ChoosePredictor) {
   ASSERT_TRUE(data.HostCanWrite());

   // pull data into device.
-  data = HostDeviceVector<Entry>(data.HostVector(), 0);
+  data.HostVector();
+  data.SetDevice(0);
   data.DeviceSpan();
   ASSERT_FALSE(data.HostCanWrite());
View File
@@ -18,6 +18,7 @@
 #include "xgboost/c_api.h"
 #include "../../src/data/adapter.h"
 #include "../../src/data/simple_dmatrix.h"
+#include "../../src/data/sparse_page_dmatrix.h"
 #include "../../src/gbm/gbtree_model.h"
 #include "xgboost/predictor.h"
@@ -45,12 +46,25 @@ void CreateSimpleTestData(const std::string& filename) {
   CreateBigTestData(filename, 6);
 }

-void CreateBigTestData(const std::string& filename, size_t n_entries) {
+void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based) {
   std::ofstream fo(filename.c_str());
   const size_t entries_per_row = 3;
+  std::string odd_row;
+  if (zero_based) {
+    odd_row = " 0:0 3:30 4:40\n";
+  } else {
+    odd_row = " 1:0 4:30 5:40\n";
+  }
+  std::string even_row;
+  if (zero_based) {
+    even_row = " 0:0 1:10 2:20\n";
+  } else {
+    even_row = " 1:0 2:10 3:20\n";
+  }
   size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row;
   for (size_t i = 0; i < n_rows; ++i) {
-    const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n";
+    auto row = i % 2 == 0 ? even_row : odd_row;
     fo << i << row;
   }
 }
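With the literals above, the generated file alternates two row patterns; under zero-based indexing the first rows come out as:

    0 0:0 1:10 2:20
    1 0:0 3:30 4:40
    2 0:0 1:10 2:20

and with `zero_based = false` every feature index is shifted up by one (`1:0 2:10 3:20` and `1:0 4:30 5:40`).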
@@ -348,13 +362,20 @@ GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
       &adapter, std::numeric_limits<float>::quiet_NaN(), 1));
 }

-std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
-    size_t n_entries, size_t page_size, std::string tmp_file) {
-  // Create sufficiently large data to make two row pages
-  CreateBigTestData(tmp_file, n_entries);
-  std::unique_ptr<DMatrix> dmat { DMatrix::Load(
-      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
+std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries,
+                                                 std::string prefix) {
+  size_t n_columns = 3;
+  size_t n_rows = n_entries / n_columns;
+  ArrayIterForTest iter(0, n_rows, n_columns, 2);
+
+  std::unique_ptr<DMatrix> dmat{DMatrix::Create(
+      static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
+      std::numeric_limits<float>::quiet_NaN(), 1, prefix)};
+  auto row_page_path =
+      data::MakeId(prefix,
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
+      ".row.page";
+  EXPECT_TRUE(FileExists(row_page_path)) << row_page_path;

   // Loop over the batches and count the records
   int64_t batch_count = 0;
@@ -368,7 +389,6 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
   return dmat;
 }
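The helper now builds its multi-page DMatrix through the callback interface instead of a libsvm temp file: ArrayIterForTest streams two batches of a 3-column random matrix, and DMatrix::Create spills them through the proxy into the page cache. Typical use in the tests (the entry count only has to be large enough to span several pages):

    // 10000 entries over 3 columns, delivered in two batches; cache files live
    // under the default "cache" prefix and are removed with the DMatrix.
    std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
      // each iteration sees one bounded-size page
    }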
 std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
     size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
     const dmlc::TemporaryDirectory& tempdir) {
@@ -432,7 +452,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
     uri += "#" + tmp_file + ".cache";
   }
   std::unique_ptr<DMatrix> dmat(
-      DMatrix::Load(uri, true, false, "auto", page_size));
+      DMatrix::Load(uri, true, false, "auto"));
   return dmat;
 }
@@ -481,6 +501,28 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
   return gbm;
 }
+ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols,
+                                   size_t batches)
+    : rows_{rows}, cols_{cols}, n_batches_{batches} {
+  XGProxyDMatrixCreate(&proxy_);
+  rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
+  std::tie(batches_, interface_) =
+      rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
+}
+
+ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
+
+int ArrayIterForTest::Next() {
+  if (iter_ == n_batches_) {
+    return 0;
+  }
+  XGProxyDMatrixSetDataDense(proxy_, batches_[iter_].c_str());
+  iter_++;
+  return 1;
+}
+
+size_t constexpr ArrayIterForTest::kRows;
+size_t constexpr ArrayIterForTest::kCols;
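ArrayIterForTest is a minimal model of a user-defined data iterator: each successful Next() parks one batch on the proxy DMatrix (here as a dense array-interface JSON string) and returns 1, while 0 signals exhaustion. A caller-side iterator has the same shape (a sketch; MakeArrayInterface is a hypothetical helper that renders a batch as an `__array_interface__` JSON string):

    class MyIter {
      std::vector<std::string> batches_{MakeArrayInterface(/*batch 0*/),
                                        MakeArrayInterface(/*batch 1*/)};
      size_t i_{0};
      DMatrixHandle proxy_;

     public:
      MyIter() { XGProxyDMatrixCreate(&proxy_); }
      ~MyIter() { XGDMatrixFree(proxy_); }
      void Reset() { i_ = 0; }
      int Next() {
        if (i_ == batches_.size()) { return 0; }  // no more batches
        XGProxyDMatrixSetDataDense(proxy_, batches_[i_++].c_str());
        return 1;
      }
      DMatrixHandle Proxy() { return proxy_; }
    };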
 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
                   std::vector<size_t> *p_row_ptr,
                   std::vector<bst_feature_t> *p_cids) {
View File
@@ -8,16 +8,16 @@ namespace xgboost {
 CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
                                            size_t cols, size_t batches)
-    : rows_{rows}, cols_{cols}, n_batches_{batches} {
-  XGProxyDMatrixCreate(&proxy_);
-  rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
+    : ArrayIterForTest{sparsity, rows, cols, batches} {
   rng_->Device(0);
   std::tie(batches_, interface_) =
       rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
   this->Reset();
 }

-CudaArrayIterForTest::~CudaArrayIterForTest() { XGDMatrixFree(proxy_); }
+size_t constexpr CudaArrayIterForTest::kRows;
+size_t constexpr CudaArrayIterForTest::kCols;
+size_t constexpr CudaArrayIterForTest::kBatches;

 int CudaArrayIterForTest::Next() {
   if (iter_ == n_batches_) {
@@ -28,8 +28,6 @@ int CudaArrayIterForTest::Next() {
   return 1;
 }

-size_t constexpr CudaArrayIterForTest::kRows;
-size_t constexpr CudaArrayIterForTest::kCols;

 std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label,
                                                                     bool float_label,
View File
@@ -55,7 +55,9 @@ int64_t GetFileSize(const std::string& filename);
 void CreateSimpleTestData(const std::string& filename);

-void CreateBigTestData(const std::string& filename, size_t n_entries);
+// Create a libsvm format file with 3 entries per row. `zero_based` specifies
+// whether the feature indices are 0-based.
+void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true);

 void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                       std::vector<xgboost::bst_float> preds,
@@ -300,8 +302,7 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
 std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
                                             int num_rows, int num_columns);

-std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
-    size_t n_entries, size_t page_size, std::string tmp_file);
+std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");

 /**
  * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
@@ -356,7 +357,8 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row
 typedef void *DMatrixHandle;  // NOLINT(*);

-class CudaArrayIterForTest {
+class ArrayIterForTest {
+ protected:
   HostDeviceVector<float> data_;
   size_t iter_ {0};
   DMatrixHandle proxy_;
@@ -373,20 +375,32 @@
   size_t static constexpr kBatches { 100 };
   size_t static constexpr kCols { 13 };

-  explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
-                                size_t cols = kCols, size_t batches = kBatches);
-  ~CudaArrayIterForTest();
   std::string AsArray() const {
     return interface_;
   }

-  int Next();
-  void Reset() {
+  virtual int Next();
+  virtual void Reset() {
     iter_ = 0;
   }
   size_t Iter() const { return iter_; }
   auto Proxy() -> decltype(proxy_) { return proxy_; }
+
+  explicit ArrayIterForTest(float sparsity, size_t rows = kRows,
+                            size_t cols = kCols, size_t batches = kBatches);
+  virtual ~ArrayIterForTest();
+};
+
+class CudaArrayIterForTest : public ArrayIterForTest {
+ public:
+  size_t static constexpr kRows{1000};
+  size_t static constexpr kBatches{100};
+  size_t static constexpr kCols{13};
+  explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
+                                size_t cols = kCols, size_t batches = kBatches);
+  int Next() override;
+  ~CudaArrayIterForTest() override = default;
 };

 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
@@ -396,11 +410,11 @@ void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
 typedef void *DataIterHandle;  // NOLINT(*)

 inline void Reset(DataIterHandle self) {
-  static_cast<CudaArrayIterForTest*>(self)->Reset();
+  static_cast<ArrayIterForTest*>(self)->Reset();
 }

 inline int Next(DataIterHandle self) {
-  return static_cast<CudaArrayIterForTest*>(self)->Next();
+  return static_cast<ArrayIterForTest*>(self)->Next();
 }
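These two inline functions are the C-style trampolines handed to DMatrix::Create: the opaque DataIterHandle is cast back to the shared base type, which is why CudaArrayIterForTest now derives from ArrayIterForTest. A self-contained illustration of the pattern (generic sketch, not XGBoost code):

    #include <cstddef>

    struct Counter {                  // stand-in for a concrete iterator
      std::size_t i{0}, n{3};
      void Reset() { i = 0; }
      int Next() { return i < n ? (++i, 1) : 0; }
    };

    typedef void *Handle;             // type-erased, C-compatible

    void Reset(Handle self) { static_cast<Counter *>(self)->Reset(); }
    int Next(Handle self) { return static_cast<Counter *>(self)->Next(); }

    int main() {
      Counter c;
      Handle h = &c;                  // handle must really point at a Counter
      Reset(h);
      int batches = 0;
      while (Next(h)) { ++batches; }  // drains exactly c.n batches
      return batches == 3 ? 0 : 1;
    }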
 class RMMAllocator;
View File
@@ -92,13 +92,10 @@ TEST(CpuPredictor, IterationRange) {
 }

 TEST(CpuPredictor, ExternalMemory) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);

   auto lparam = CreateEmptyGenericParam(GPUIDX);
   std::unique_ptr<Predictor> cpu_predictor =
View File
@@ -102,13 +102,10 @@ TEST(GPUPredictor, ExternalMemoryTest) {
   gbm::GBTreeModel model = CreateTestModel(&param, n_classes);

   std::vector<std::unique_ptr<DMatrix>> dmats;
-  dmlc::TemporaryDirectory tmpdir;
-  std::string file0 = tmpdir.path + "/big_0.libsvm";
-  std::string file1 = tmpdir.path + "/big_1.libsvm";
-  std::string file2 = tmpdir.path + "/big_2.libsvm";
-  dmats.push_back(CreateSparsePageDMatrix(400, 64UL, file0));
-  dmats.push_back(CreateSparsePageDMatrix(800, 128UL, file1));
-  dmats.push_back(CreateSparsePageDMatrix(8000, 1024UL, file2));
+  dmats.push_back(CreateSparsePageDMatrix(400));
+  dmats.push_back(CreateSparsePageDMatrix(800));
+  dmats.push_back(CreateSparsePageDMatrix(8000));

   for (const auto& dmat: dmats) {
     dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
View File
@@ -98,8 +98,7 @@ TEST(Learner, SLOW_CheckMultiBatch) {  // NOLINT
   const std::string tmp_file = tempdir.path + "/big.libsvm";
   CreateBigTestData(tmp_file, 50000);
   std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(
-      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 100));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
+      tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
   EXPECT_FALSE(dmat->SingleColBlock());
   size_t num_row = dmat->Info().num_row_;
   std::vector<bst_float> labels(num_row);
View File
@@ -27,7 +27,7 @@ void VerifySampling(size_t page_size,
   }
   gpair.SetDevice(0);

-  BatchParam param{0, 256, page_size};
+  BatchParam param{0, 256};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   if (page_size != 0) {
     EXPECT_NE(page->n_rows, kRows);
@@ -82,7 +82,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   auto gpair = GenerateRandomGradients(kRows);
   gpair.SetDevice(0);

-  BatchParam param{0, 256, kPageSize};
+  BatchParam param{0, 256};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   EXPECT_NE(page->n_rows, kRows);
View File
@@ -15,7 +15,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
   float sparsity = is_dense ? 0.0f : 0.5f;
   auto matrix = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix();

-  BatchParam batch_param{0, static_cast<int32_t>(kBins), 0};
+  BatchParam batch_param{0, static_cast<int32_t>(kBins)};

   for (auto const& batch : matrix->GetBatches<EllpackPage>(batch_param)) {
     auto* page = batch.Impl();
@@ -116,7 +116,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   auto x = GenerateRandomCategoricalSingleColumn(kRows, num_categories);
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
-  BatchParam batch_param{0, static_cast<int32_t>(kBins), 0};
+  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
   tree::RowPartitioner row_partitioner(0, kRows);
   auto ridx = row_partitioner.GetRows(0);
   dh::device_vector<GradientPairPrecise> cat_hist(num_categories);
View File
@@ -152,7 +152,6 @@ TEST(GpuHist, ApplySplit) {
   BatchParam bparam;
   bparam.gpu_id = 0;
   bparam.max_bin = 3;
-  bparam.gpu_page_size = 0;

   for (auto& ellpack : m->GetBatches<EllpackPage>(bparam)){
     auto impl = ellpack.Impl();
@@ -291,9 +290,13 @@ void TestHistogramIndexImpl() {
   // Extract the device maker from the histogram makers and from that its compressed
   // histogram index
   const auto &maker = hist_maker.maker;
+  auto grad = GenerateRandomGradients(kNRows);
+  grad.SetDevice(0);
+  maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
   std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());

   const auto &maker_ext = hist_maker_ext.maker;
+  maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols);
   std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());

   ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
@@ -365,7 +368,7 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
   // Loop over the batches and count the records
   int64_t batch_count = 0;
   int64_t row_count = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin, gpu_page_size})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin})) {
     EXPECT_LT(batch.Size(), dmat->Info().num_row_);
     batch_count++;
     row_count += batch.Size();
@@ -386,7 +389,6 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
   tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker;
   GenericParameter generic_param(CreateEmptyGenericParam(0));
-  generic_param.gpu_page_size = gpu_page_size;
   hist_maker.Configure(args, &generic_param);
   hist_maker.Update(gpair, dmat, {tree});