From bd1f3a38f03e8131a4eca9890bb98851f0751c1a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 16 Jul 2021 12:33:31 +0800 Subject: [PATCH] Rewrite sparse dmatrix using callbacks. (#7092) - Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves. - Remove use of threaded iterator and IO queue. - Remove `page_size`. - Make sure the number of pages in memory is bounded. - Make sure the cache can not be violated. - Provide an interface for internal algorithms to process data asynchronously. --- amalgamation/xgboost-all0.cc | 8 +- include/xgboost/c_api.h | 115 ++- include/xgboost/data.h | 68 +- include/xgboost/generic_parameters.h | 6 - src/c_api/c_api.cc | 57 +- src/common/quantile.cu | 1 - src/data/data.cc | 149 ++-- src/data/data.cu | 6 +- src/data/ellpack_page.cu | 15 +- src/data/ellpack_page.cuh | 4 +- src/data/ellpack_page_source.cc | 24 - src/data/ellpack_page_source.cu | 85 +- src/data/ellpack_page_source.h | 67 +- src/data/file_iterator.h | 115 +++ src/data/iterative_device_dmatrix.cu | 4 +- src/data/iterative_device_dmatrix.h | 16 +- src/data/proxy_dmatrix.h | 2 +- src/data/simple_batch_iterator.h | 22 +- src/data/simple_dmatrix.cc | 32 +- src/data/simple_dmatrix.cu | 2 +- src/data/simple_dmatrix.h | 13 +- src/data/sparse_page_dmatrix.cc | 136 ++- src/data/sparse_page_dmatrix.cu | 46 ++ src/data/sparse_page_dmatrix.h | 127 ++- src/data/sparse_page_source.cc | 77 -- src/data/sparse_page_source.cu | 17 + src/data/sparse_page_source.h | 781 +++++++----------- src/data/sparse_page_writer.h | 97 --- src/tree/gpu_hist/gradient_based_sampler.cu | 16 +- src/tree/gpu_hist/gradient_based_sampler.cuh | 26 +- src/tree/updater_gpu_hist.cu | 7 +- tests/cpp/common/test_column_matrix.cc | 4 +- tests/cpp/common/test_hist_util.h | 2 +- tests/cpp/data/test_data.cc | 17 +- tests/cpp/data/test_ellpack_page.cu | 6 +- tests/cpp/data/test_file_iterator.cc | 46 ++ .../cpp/data/test_iterative_device_dmatrix.cu | 2 +- 
tests/cpp/data/test_metainfo.cc | 2 +- tests/cpp/data/test_metainfo.cu | 2 +- tests/cpp/data/test_sparse_page_dmatrix.cc | 391 +++------ tests/cpp/data/test_sparse_page_dmatrix.cu | 73 +- tests/cpp/gbm/test_gbtree.cc | 3 +- tests/cpp/helpers.cc | 64 +- tests/cpp/helpers.cu | 10 +- tests/cpp/helpers.h | 38 +- tests/cpp/predictor/test_cpu_predictor.cc | 5 +- tests/cpp/predictor/test_gpu_predictor.cu | 11 +- tests/cpp/test_learner.cc | 3 +- .../gpu_hist/test_gradient_based_sampler.cu | 4 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 4 +- tests/cpp/tree/test_gpu_hist.cu | 8 +- 51 files changed, 1445 insertions(+), 1391 deletions(-) delete mode 100644 src/data/ellpack_page_source.cc create mode 100644 src/data/file_iterator.h create mode 100644 src/data/sparse_page_dmatrix.cu delete mode 100644 src/data/sparse_page_source.cc create mode 100644 src/data/sparse_page_source.cu create mode 100644 tests/cpp/data/test_file_iterator.cc diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc index 16650d215..7a245ebfe 100644 --- a/amalgamation/xgboost-all0.cc +++ b/amalgamation/xgboost-all0.cc @@ -37,18 +37,14 @@ #include "../src/data/simple_dmatrix.cc" #include "../src/data/sparse_page_raw_format.cc" #include "../src/data/ellpack_page.cc" -#include "../src/data/ellpack_page_source.cc" #include "../src/data/gradient_index.cc" +#include "../src/data/sparse_page_dmatrix.cc" +#include "../src/data/proxy_dmatrix.cc" // prediction #include "../src/predictor/predictor.cc" #include "../src/predictor/cpu_predictor.cc" -#if DMLC_ENABLE_STD_THREAD -#include "../src/data/sparse_page_dmatrix.cc" -#include "../src/data/sparse_page_source.cc" -#endif - // trees #include "../src/tree/param.cc" #include "../src/tree/tree_model.cc" diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index f367693a0..9f5f21acf 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -223,19 +223,31 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, * - 
XGBCallbackDataIterNext * - XGDMatrixCreateFromDataIter * - * Another set is used by Quantile based DMatrix (used by hist algorithm) for reducing - * memory usage. Currently only GPU implementation is available. It accept foreign data - * iterators as callbacks and works similar to external memory. For GPU Hist, the data is - * first compressed by quantile sketching then merged. This is particular useful for - * distributed setting as it eliminates 2 copies of data. 1 by a `concat` from external - * library to make the data into a blob for normal DMatrix initialization, another by the - * internal CSR copy of DMatrix. Related functions are: + * Another set is used by external data iterator. It accepts foreign data iterators as + * callbacks. There are 2 different scenarios where users might want to pass in callbacks + * instead of raw data. First it's the Quantile DMatrix used by GPU Hist. For this case, + * the data is first compressed by quantile sketching then merged. This is particularly + * useful for distributed setting as it eliminates 2 copies of data. 1 by a `concat` from + * external library to make the data into a blob for normal DMatrix initialization, + * another by the internal CSR copy of DMatrix. The second use case is external memory + * support where users can pass a custom data iterator into XGBoost for loading data in + * batches. There are short notes on each of the use cases in the respective DMatrix factory + * function. * + * Related functions are: + * + * # Factory functions + * - `XGDMatrixCreateFromCallback` for external memory + * - `XGDeviceQuantileDMatrixCreateFromCallback` for quantile DMatrix + * + * # Proxy that callers can use to pass data to XGBoost * - XGProxyDMatrixCreate * - XGDMatrixCallbackNext * - DataIterResetCallback * - XGProxyDMatrixSetDataCudaArrayInterface * - XGProxyDMatrixSetDataCudaColumnar + * - XGProxyDMatrixSetDataDense + * - XGProxyDMatrixSetDataCSR * - ...
(data setters) */ @@ -308,17 +320,9 @@ XGB_DLL int XGDMatrixCreateFromDataIter( const char* cache_info, DMatrixHandle *out); -/* == Second set of callback functions, used by constructing Quantile based DMatrix. === - * - * Short note for how to use the second set of callback for GPU Hist tree method. - * - * Step 0: Define a data iterator with 2 methods `reset`, and `next`. - * Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle. - * Step 2: Pass the iterator handle, proxy handle and 2 methods into - * `XGDeviceQuantileDMatrixCreateFromCallback`. - * Step 3: Call appropriate data setters in `next` functions. - * - * See test_iterative_device_dmatrix.cu or Python interface for examples. +/** + * Second set of callback functions, used by constructing Quantile DMatrix or external + * memory DMatrix using custom iterator. */ /*! @@ -344,8 +348,53 @@ XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter); // NOLINT( */ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLINT(*) + /*! - * \brief Create a device DMatrix with data iterator. + * \brief Create an external memory DMatrix with data iterator. + * + * Short note for how to use second set of callback for external memory data support: + * + * - Step 0: Define a data iterator with 2 methods `reset`, and `next`. + * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle. + * - Step 2: Pass the iterator handle, proxy handle and 2 methods into + * `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object. + * - Step 3: Call appropriate data setters in `next` functions. + * + * For example usage see demo/c-api/external-memory + * + * \param iter A handle to external data iterator. + * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`. + * \param reset Callback function resetting the iterator state. + * \param next Callback function yielding the next batch of data. 
+ * \param c_json_config JSON encoded parameters for DMatrix construction. Accepted fields are: + * + * - missing: Which value to represent missing value + * - cache_prefix: The path of cache file, caller must initialize all the directories in this path. + * - nthread (optional): Number of threads used for initializing DMatrix. + * + * \param out The created external memory DMatrix + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, + DMatrixHandle proxy, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, + char const* c_json_config, + DMatrixHandle *out); + +/*! + * \brief Create a Quantile DMatrix with data iterator. + * + * Short note for how to use the second set of callback for GPU Hist tree method: + * + * - Step 0: Define a data iterator with 2 methods `reset`, and `next`. + * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle. + * - Step 2: Pass the iterator handle, proxy handle and 2 methods into + * `XGDeviceQuantileDMatrixCreateFromCallback`. + * - Step 3: Call appropriate data setters in `next` functions. + * + * See test_iterative_device_dmatrix.cu or Python interface for examples. * * \param iter A handle to external data iterator. * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`. @@ -362,6 +411,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback( DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin, DMatrixHandle *out); + /*! * \brief Set data on a DMatrix proxy. * @@ -387,6 +437,33 @@ XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle, XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, const char *c_interface_str); +/*! + * \brief Set data on a DMatrix proxy. 
+ * + * \param handle A DMatrix proxy created by XGProxyDMatrixCreate + * \param c_interface_str Null terminated JSON document string representation of array + * interface. + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle, + char const *c_interface_str); + +/*! + * \brief Set data on a DMatrix proxy. + * + * \param handle A DMatrix proxy created by XGProxyDMatrixCreate + * \param indptr JSON encoded __array_interface__ to row pointer in CSR. + * \param indices JSON encoded __array_interface__ to column indices in CSR. + * \param data JSON encoded __array_interface__ to values in CSR. + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr, + char const *indices, char const *data, + bst_ulong ncol); + + /* * ==========================- End data callback APIs ========================== */ diff --git a/include/xgboost/data.h b/include/xgboost/data.h index ada292e42..3a403d541 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -171,9 +171,12 @@ class MetaInfo { * \param that The other MetaInfo object. * * \param accumulate_rows Whether rows need to be accumulated in this function. If - * client code knows number of rows in advance, set this parameter to false. + * client code knows number of rows in advance, set this + * parameter to false. + * \param check_column Whether the extend method should check the consistency of + * columns. */ - void Extend(MetaInfo const& that, bool accumulate_rows); + void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column); private: /*! \brief argsort of labels */ @@ -211,14 +214,12 @@ struct BatchParam { int gpu_id; /*! \brief Maximum number of bins per feature for histograms. */ int max_bin{0}; - /*! \brief Page size for external memory mode.
*/ - size_t gpu_page_size; BatchParam() = default; - BatchParam(int32_t device, int32_t max_bin, size_t gpu_page_size = 0) - : gpu_id{device}, max_bin{max_bin}, gpu_page_size{gpu_page_size} {} - inline bool operator!=(const BatchParam& other) const { - return gpu_id != other.gpu_id || max_bin != other.max_bin || - gpu_page_size != other.gpu_page_size; + BatchParam(int32_t device, int32_t max_bin) + : gpu_id{device}, max_bin{max_bin} {} + + bool operator!=(const BatchParam& other) const { + return gpu_id != other.gpu_id || max_bin != other.max_bin; } }; @@ -390,11 +391,12 @@ class GHistIndexMatrix; template class BatchIteratorImpl { public: + using iterator_category = std::forward_iterator_tag; // NOLINT virtual ~BatchIteratorImpl() = default; - virtual T& operator*() = 0; virtual const T& operator*() const = 0; - virtual void operator++() = 0; + virtual BatchIteratorImpl& operator++() = 0; virtual bool AtEnd() const = 0; + virtual std::shared_ptr Page() const = 0; }; template @@ -402,15 +404,12 @@ class BatchIterator { public: using iterator_category = std::forward_iterator_tag; // NOLINT explicit BatchIterator(BatchIteratorImpl* impl) { impl_.reset(impl); } + explicit BatchIterator(std::shared_ptr> impl) { impl_ = impl; } - void operator++() { + BatchIterator &operator++() { CHECK(impl_ != nullptr); ++(*impl_); - } - - T& operator*() { - CHECK(impl_ != nullptr); - return *(*impl_); + return *this; } const T& operator*() const { @@ -428,6 +427,10 @@ class BatchIterator { return impl_->AtEnd(); } + std::shared_ptr Page() const { + return impl_->Page(); + } + private: std::shared_ptr> impl_; }; @@ -499,8 +502,7 @@ class DMatrix { static DMatrix* Load(const std::string& uri, bool silent, bool load_row_split, - const std::string& file_format = "auto", - size_t page_size = kPageSize); + const std::string& file_format = "auto"); /** * \brief Creates a new DMatrix from an external data adapter. 
@@ -516,8 +518,7 @@ class DMatrix { */ template static DMatrix* Create(AdapterT* adapter, float missing, int nthread, - const std::string& cache_prefix = "", - size_t page_size = kPageSize); + const std::string& cache_prefix = ""); /** * \brief Create a new Quantile based DMatrix used for histogram based algorithm. @@ -545,6 +546,31 @@ class DMatrix { int nthread, int max_bin); + /** + * \brief Create an external memory DMatrix with callbacks. + * + * \tparam DataIterHandle External iterator type, defined in C API. + * \tparam DMatrixHandle DMatrix handle, defined in C API. + * \tparam DataIterResetCallback Callback for reset, prototype defined in C API. + * \tparam XGDMatrixCallbackNext Callback for next, prototype defined in C API. + * + * \param iter External data iterator + * \param proxy A handle to ProxyDMatrix + * \param reset Callback for reset + * \param next Callback for next + * \param missing Value that should be treated as missing. + * \param nthread number of threads used for initialization. + * \param cache Prefix of cache file path. + * + * \return A created external memory DMatrix. + */ + template + static DMatrix *Create(DataIterHandle iter, DMatrixHandle proxy, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, float missing, + int32_t nthread, std::string cache); + virtual DMatrix *Slice(common::Span ridxs) = 0; /*! \brief Number of rows per page in external memory. Approximately 100MB per page for * dataset with 100 features. */ diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h index fd74abf2c..583252bcc 100644 --- a/include/xgboost/generic_parameters.h +++ b/include/xgboost/generic_parameters.h @@ -29,8 +29,6 @@ struct GenericParameter : public XGBoostParameter { int gpu_id; // fail when gpu_id is invalid bool fail_on_invalid_gpu_id {false}; - // gpu page size in external memory mode, 0 means using the default. - size_t gpu_page_size; bool validate_parameters {false}; /*!
@@ -66,10 +64,6 @@ struct GenericParameter : public XGBoostParameter { DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id) .set_default(false) .describe("Fail with error when gpu_id is invalid."); - DMLC_DECLARE_FIELD(gpu_page_size) - .set_default(0) - .set_lower_bound(0) - .describe("GPU page size when running in external memory mode."); DMLC_DECLARE_FIELD(validate_parameters) .set_default(false) .describe("Enable checking whether parameters are used or not."); diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 85618333c..7f0ab7735 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -190,6 +190,35 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs, #endif // Create from data iterator +XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, + DMatrixHandle proxy, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, + char const* c_json_config, + DMatrixHandle *out) { + API_BEGIN(); + auto config = Json::Load(StringView{c_json_config}); + float missing = get(config["missing"]); + std::string cache = get(config["cache_prefix"]); + int32_t n_threads = omp_get_max_threads(); + if (!IsA(config["nthread"])) { + n_threads = get(config["nthread"]); + } + *out = new std::shared_ptr{xgboost::DMatrix::Create( + iter, proxy, reset, next, missing, n_threads, cache)}; + API_END(); +} + +XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback( + DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, float missing, int nthread, + int max_bin, DMatrixHandle *out) { + API_BEGIN(); + *out = new std::shared_ptr{ + xgboost::DMatrix::Create(iter, proxy, reset, next, missing, nthread, max_bin)}; + API_END(); +} + XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) { API_BEGIN(); *out = new std::shared_ptr(new xgboost::data::DMatrixProxy);; @@ -221,15 +250,31 @@ XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, API_END(); } -XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback( - 
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, - XGDMatrixCallbackNext *next, float missing, int nthread, - int max_bin, DMatrixHandle *out) { +XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle, + char const *c_interface_str) { API_BEGIN(); - *out = new std::shared_ptr{ - xgboost::DMatrix::Create(iter, proxy, reset, next, missing, nthread, max_bin)}; + CHECK_HANDLE(); + auto p_m = static_cast *>(handle); + CHECK(p_m); + auto m = static_cast(p_m->get()); + CHECK(m) << "Current DMatrix type does not support set data."; + m->SetArrayData(c_interface_str); API_END(); } + +XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr, + char const *indices, char const *data, + xgboost::bst_ulong ncol) { + API_BEGIN(); + CHECK_HANDLE(); + auto p_m = static_cast *>(handle); + CHECK(p_m); + auto m = static_cast(p_m->get()); + CHECK(m) << "Current DMatrix type does not support set data."; + m->SetCSRData(indptr, indices, data, ncol, true); + API_END(); +} + // End Create from data iterator XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, diff --git a/src/common/quantile.cu b/src/common/quantile.cu index e9f3e93d0..07949d18b 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -91,7 +91,6 @@ void PruneImpl(int device, } float w = back.rmin - front.rmax; - assert(w != 0); auto budget = static_cast(d_out.size()); assert(budget != 0); auto q = ((static_cast(idx) * w) / (static_cast(to) - 1.0f) + front.rmax); diff --git a/src/data/data.cc b/src/data/data.cc index 536a836ec..99041ef96 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -22,11 +22,10 @@ #include "../common/threading_utils.h" #include "../data/adapter.h" #include "../data/iterative_device_dmatrix.h" +#include "file_iterator.h" -#if DMLC_ENABLE_STD_THREAD #include "./sparse_page_source.h" #include "./sparse_page_dmatrix.h" -#endif // DMLC_ENABLE_STD_THREAD namespace dmlc { 
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>); @@ -500,13 +499,17 @@ void MetaInfo::GetFeatureInfo(const char *field, } } -void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) { +void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_column) { if (accumulate_rows) { this->num_row_ += that.num_row_; } if (this->num_col_ != 0) { - CHECK_EQ(this->num_col_, that.num_col_) - << "Number of columns must be consistent across batches."; + if (check_column) { + CHECK_EQ(this->num_col_, that.num_col_) + << "Number of columns must be consistent across batches."; + } else { + this->num_col_ = std::max(this->num_col_, that.num_col_); + } } this->num_col_ = that.num_col_; @@ -630,11 +633,34 @@ DMatrix::~DMatrix() { } } +DMatrix *TryLoadBinary(std::string fname, bool silent) { + int magic; + std::unique_ptr fi( + dmlc::Stream::Create(fname.c_str(), "r", true)); + if (fi != nullptr) { + common::PeekableInStream is(fi.get()); + if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) { + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&magic, sizeof(magic), 1); + } + if (magic == data::SimpleDMatrix::kMagic) { + DMatrix *dmat = new data::SimpleDMatrix(&is); + if (!silent) { + LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ + << " matrix with " << dmat->Info().num_nonzero_ + << " entries loaded from " << fname; + } + return dmat; + } + } + } + return nullptr; +} + DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split, - const std::string& file_format, - const size_t page_size) { + const std::string& file_format) { std::string fname, cache_file; size_t dlm_pos = uri.find('#'); if (dlm_pos != std::string::npos) { @@ -682,35 +708,34 @@ DMatrix* DMatrix::Load(const std::string& uri, // legacy handling of binary data loading if (file_format == "auto" && npart == 1) { - int magic; - std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); - if (fi != nullptr) { 
- common::PeekableInStream is(fi.get()); - if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) { - if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(&magic, sizeof(magic), 1); - } - if (magic == data::SimpleDMatrix::kMagic) { - DMatrix* dmat = new data::SimpleDMatrix(&is); - if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " - << dmat->Info().num_nonzero_ << " entries loaded from " << uri; - } - return dmat; - } - } + DMatrix *loaded = TryLoadBinary(fname, silent); + if (loaded) { + return loaded; } } - std::unique_ptr > parser( - dmlc::Parser::Create(fname.c_str(), partid, npart, file_format.c_str())); - data::FileAdapter adapter(parser.get()); DMatrix* dmat {nullptr}; - try { - dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1, - cache_file, page_size); - } catch (dmlc::Error& e) { + if (cache_file.empty()) { + std::unique_ptr> parser( + dmlc::Parser::Create(fname.c_str(), partid, npart, + file_format.c_str())); + data::FileAdapter adapter(parser.get()); + dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), + 1, cache_file); + } else { + data::FileIterator iter{fname, uint32_t(partid), uint32_t(npart), + file_format}; + dmat = new data::SparsePageDMatrix{ + &iter, + iter.Proxy(), + data::fileiter::Reset, + data::fileiter::Next, + std::numeric_limits::quiet_NaN(), + 1, + cache_file}; + } + } catch (dmlc::Error &e) { std::vector splited = common::Split(fname, '#'); std::vector args = common::Split(splited.front(), '?'); std::string format {file_format}; @@ -734,10 +759,6 @@ DMatrix* DMatrix::Load(const std::string& uri, LOG(FATAL) << "Encountered parser error:\n" << e.what(); } - if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " - << dmat->Info().num_nonzero_ << " entries loaded from " << uri; - } /* sync up number of features after matrix loaded. 
* partitioned data will fail the train/val validation check * since partitioned data not knowing the real number of features. */ @@ -769,12 +790,19 @@ DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy, XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin) { -#if defined(XGBOOST_USE_CUDA) - return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing, nthread, max_bin); -#else - common::AssertGPUSupport(); - return nullptr; -#endif + return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing, + nthread, max_bin); +} + +template +DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, float missing, + int32_t n_threads, + std::string cache) { + return new data::SparsePageDMatrix(iter, proxy, reset, next, missing, n_threads, + cache); } template DMatrix *DMatrix::Create( + DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string); + template DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size) { - if (cache_prefix.length() == 0) { - // Data split mode is fixed to be row right now. 
- return new data::SimpleDMatrix(adapter, missing, nthread); - } else { -#if DMLC_ENABLE_STD_THREAD - return new data::SparsePageDMatrix(adapter, missing, nthread, cache_prefix, - page_size); -#else - LOG(FATAL) << "External memory is not enabled in mingw"; - return nullptr; -#endif // DMLC_ENABLE_STD_THREAD - } + const std::string& cache_prefix) { + return new data::SimpleDMatrix(adapter, missing, nthread); } template DMatrix* DMatrix::Create( data::DenseAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::ArrayAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::CSRAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::CSCAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::DataTableAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::FileAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::CSRArrayAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix * DMatrix::Create(data::IteratorAdapter *adapter, - float missing, int nthread, const std::string &cache_prefix, - size_t page_size); + float missing, int nthread, const std::string &cache_prefix); SparsePage SparsePage::GetTranspose(int num_columns) const { SparsePage transpose; @@ -1044,6 
+1065,8 @@ SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthrea template uint64_t SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread); template uint64_t +SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, int nthread); +template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread); template uint64_t SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread); diff --git a/src/data/data.cu b/src/data/data.cu index dffe19d66..a9397803c 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -167,7 +167,7 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { template DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size) { + const std::string& cache_prefix) { CHECK_EQ(cache_prefix.size(), 0) << "Device memory construction is not currently supported with external " "memory."; @@ -176,8 +176,8 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, template DMatrix* DMatrix::Create( data::CudfAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::CupyAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix, size_t page_size); + const std::string& cache_prefix); } // namespace xgboost diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 3574f7d33..2315700dc 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -122,6 +122,7 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) dmat->Info().feature_types.SetDevice(param.gpu_id); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); monitor_.Start("BinningCompression"); + CHECK(dmat->SingleColBlock()); for (const auto& batch : dmat->GetBatches()) { 
CreateHistIndices(param.gpu_id, batch, ft); } @@ -301,9 +302,8 @@ struct CopyPage { // The number of elements to skip. size_t offset; - CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset) - : cbw{dst->NumSymbols()}, - dst_data_d{dst->gidx_buffer.DevicePointer()}, + CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset) + : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()}, src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, offset(offset) {} @@ -314,7 +314,8 @@ struct CopyPage { }; // Copy the data from the given EllpackPage to the current page. -size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) { +size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page, + size_t offset) { monitor_.Start("Copy"); size_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); @@ -351,7 +352,7 @@ struct CompactPage { size_t base_rowid; size_t row_stride; - CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, + CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span row_indexes) : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()}, @@ -374,7 +375,7 @@ struct CompactPage { }; // Compacts the data from the given EllpackPage into the current page. 
-void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, +void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start("Compact"); CHECK_EQ(row_stride, page->row_stride); @@ -459,7 +460,7 @@ void EllpackPageImpl::CreateHistIndices(int device, gidx_buffer.DevicePointer(), row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), device_accessor.feature_segments.data(), feature_types, - row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride, + batch_row_begin, batch_nrows, row_stride, null_gidx_value); } } diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 01861d141..bd7a574d6 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -164,7 +164,7 @@ class EllpackPageImpl { * @param offset The number of elements to skip before copying. * @returns The number of elements copied. */ - size_t Copy(int device, EllpackPageImpl* page, size_t offset); + size_t Copy(int device, EllpackPageImpl const *page, size_t offset); /*! \brief Compact the given ELLPACK page into the current page. * @@ -172,7 +172,7 @@ class EllpackPageImpl { * @param page The ELLPACK page to compact from. * @param row_indexes Row indexes for the compacted page. */ - void Compact(int device, EllpackPageImpl* page, common::Span row_indexes); + void Compact(int device, EllpackPageImpl const* page, common::Span row_indexes); /*! \return Number of instances in the page. */ diff --git a/src/data/ellpack_page_source.cc b/src/data/ellpack_page_source.cc deleted file mode 100644 index 2838dc400..000000000 --- a/src/data/ellpack_page_source.cc +++ /dev/null @@ -1,24 +0,0 @@ -/*! 
- * Copyright 2019 XGBoost contributors - */ -#ifndef XGBOOST_USE_CUDA -#include -#if DMLC_ENABLE_STD_THREAD - -#include "ellpack_page_source.h" -#include -namespace xgboost { -namespace data { - -EllpackPageSource::EllpackPageSource(DMatrix* dmat, - const std::string& cache_info, - const BatchParam& param) noexcept(false) { - LOG(FATAL) - << "Internal Error: " - "XGBoost is not compiled with CUDA but EllpackPageSource is required"; -} - -} // namespace data -} // namespace xgboost -#endif // DMLC_ENABLE_STD_THREAD -#endif // XGBOOST_USE_CUDA diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 1729c9d8c..115d593e1 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -1,89 +1,24 @@ /*! - * Copyright 2019 XGBoost contributors + * Copyright 2019-2021 XGBoost contributors */ #include #include -#include "../common/hist_util.cuh" - #include "ellpack_page.cuh" #include "ellpack_page_source.h" -#include "sparse_page_source.h" namespace xgboost { namespace data { - -// Build the quantile sketch across the whole input data, then use the histogram cuts to compress -// each CSR page, and write the accumulated ELLPACK pages to disk. 
-EllpackPageSource::EllpackPageSource(DMatrix* dmat, - const std::string& cache_info, - const BatchParam& param) noexcept(false) { - cache_info_ = ParseCacheInfo(cache_info, kPageType_); - for (auto file : cache_info_.name_shards) { - CheckCacheFileExists(file); - } - if (param.gpu_page_size > 0) { - page_size_ = param.gpu_page_size; - } - - monitor_.Init("ellpack_page_source"); - dh::safe_cuda(cudaSetDevice(param.gpu_id)); - - monitor_.Start("Quantiles"); - size_t row_stride = GetRowStride(dmat); - auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin); - monitor_.Stop("Quantiles"); - - monitor_.Start("WriteEllpackPages"); - WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride); - monitor_.Stop("WriteEllpackPages"); - - external_prefetcher_.reset( - new ExternalMemoryPrefetcher(cache_info_)); -} - -// Compress each CSR page to ELLPACK, and write the accumulated pages to disk. -void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat, - const common::HistogramCuts& cuts, - const std::string& cache_info, - size_t row_stride) const { - auto cinfo = ParseCacheInfo(cache_info, kPageType_); - const size_t extra_buffer_capacity = 6; - SparsePageWriter writer(cinfo.name_shards, cinfo.format_shards, - extra_buffer_capacity); - std::shared_ptr page; - SparsePage temp_host_page; - writer.Alloc(&page); - auto* impl = page->Impl(); - auto ft = dmat->Info().feature_types.ConstDeviceSpan(); - - size_t bytes_write = 0; - double tstart = dmlc::GetTime(); - for (const auto& batch : dmat->GetBatches()) { - temp_host_page.Push(batch); - - size_t mem_cost_bytes = - EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts); - if (mem_cost_bytes >= page_size_) { - bytes_write += mem_cost_bytes; - *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(), - row_stride, ft); - writer.PushWrite(std::move(page)); - writer.Alloc(&page); - impl = page->Impl(); - temp_host_page.Clear(); - double tdiff = dmlc::GetTime() - tstart; - 
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in " - << ((bytes_write >> 20UL) / tdiff) << " MB/s, " - << (bytes_write >> 20UL) << " written"; - } - } - if (temp_host_page.Size() != 0) { - *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(), - row_stride, ft); - writer.PushWrite(std::move(page)); +void EllpackPageSource::Fetch() { + if (!this->ReadCache()) { + auto const &csr = source_->Page(); + this->page_.reset(new EllpackPage{}); + auto *impl = this->page_->Impl(); + *impl = EllpackPageImpl(param_.gpu_id, *cuts_, *csr, is_dense_, row_stride_, + feature_types_); + page_->SetBaseRowId(csr->base_rowid); + this->WriteCache(); } } - } // namespace data } // namespace xgboost diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index a1ce587c2..a43ac9881 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -1,5 +1,5 @@ /*! - * Copyright 2019 by XGBoost Contributors + * Copyright 2019-2021 by XGBoost Contributors */ #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_ @@ -8,57 +8,44 @@ #include #include #include +#include -#include "../common/timer.h" +#include "../common/common.h" #include "../common/hist_util.h" #include "sparse_page_source.h" namespace xgboost { namespace data { -/*! - * \brief External memory data source for ELLPACK format. - * - */ -class EllpackPageSource { +class EllpackPageSource : public PageSourceIncMixIn { + bool is_dense_; + size_t row_stride_; + BatchParam param_; + common::Span feature_types_; + std::unique_ptr cuts_; + public: - /*! - * \brief Create source from cache files the cache_prefix. - * \param cache_prefix The prefix of cache we want to solve. 
- */ - explicit EllpackPageSource(DMatrix* dmat, - const std::string& cache_info, - const BatchParam& param) noexcept(false); - - BatchSet GetBatchSet() { - auto begin_iter = BatchIterator( - new SparseBatchIteratorImpl, - EllpackPage>(external_prefetcher_.get())); - return BatchSet(begin_iter); + EllpackPageSource( + float missing, int nthreads, bst_feature_t n_features, size_t n_batches, + std::shared_ptr cache, BatchParam param, + std::unique_ptr cuts, bool is_dense, + size_t row_stride, common::Span feature_types, + std::shared_ptr source) + : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache), + is_dense_{is_dense}, row_stride_{row_stride}, param_{param}, + feature_types_{feature_types}, cuts_{std::move(cuts)} { + this->source_ = source; + this->Fetch(); } - ~EllpackPageSource() { - external_prefetcher_.reset(); - for (auto file : cache_info_.name_shards) { - TryDeleteCacheFile(file); - } - } - - private: - void WriteEllpackPages(int device, DMatrix* dmat, - const common::HistogramCuts& cuts, - const std::string& cache_info, - size_t row_stride) const; - - /*! \brief The page type string for ELLPACK. */ - const std::string kPageType_{".ellpack.page"}; - - size_t page_size_{DMatrix::kPageSize}; - common::Monitor monitor_; - std::unique_ptr> external_prefetcher_; - CacheInfo cache_info_; + void Fetch() final; }; +#if !defined(XGBOOST_USE_CUDA) +inline void EllpackPageSource::Fetch() { + common::AssertGPUSupport(); +} +#endif // !defined(XGBOOST_USE_CUDA) } // namespace data } // namespace xgboost diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h new file mode 100644 index 000000000..6d6adb62b --- /dev/null +++ b/src/data/file_iterator.h @@ -0,0 +1,115 @@ +/*! 
+ * Copyright 2021 XGBoost contributors + */ +#ifndef XGBOOST_DATA_FILE_ITERATOR_H_ +#define XGBOOST_DATA_FILE_ITERATOR_H_ + +#include +#include +#include +#include + +#include "dmlc/data.h" +#include "xgboost/c_api.h" +#include "xgboost/json.h" +#include "array_interface.h" + +namespace xgboost { +namespace data { +/** + * An iterator for implementing external memory support with file inputs. Users of + * external memory are encouraged to define their own file parsers/loaders so this one is + * just here for compatibility with old versions of XGBoost and CLI interface. + */ +class FileIterator { + // uri of input file, encodes parameters about whether it's 1-based index etc. dmlc + // parser will decode this information. + std::string uri_; + // Equal to rank_id in distributed training, used to split file into parts for each + // worker. + uint32_t part_idx_; + // Equal to the total number of workers. + uint32_t n_parts_; + // Format of the input file, like "libsvm". + std::string type_; + + DMatrixHandle proxy_; + + std::unique_ptr> parser_; + // Temporary reference to stage the data. + dmlc::RowBlock row_block_; + // Storage for the array interface strings. 
+ std::string indptr_; + std::string values_; + std::string indices_; + + public: + FileIterator(std::string uri, unsigned part_index, unsigned num_parts, + std::string type) + : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts}, + type_{std::move(type)} { + XGProxyDMatrixCreate(&proxy_); + } + ~FileIterator() { + XGDMatrixFree(proxy_); + } + + int Next() { + CHECK(parser_); + if (parser_->Next()) { + row_block_ = parser_->Value(); + + indptr_ = MakeArrayInterface(row_block_.offset, row_block_.size + 1); + values_ = MakeArrayInterface(row_block_.value, + row_block_.offset[row_block_.size]); + indices_ = MakeArrayInterface(row_block_.index, + row_block_.offset[row_block_.size]); + + size_t n_columns = *std::max_element( + row_block_.index, + row_block_.index + row_block_.offset[row_block_.size]); + // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore + // this condition and just add 1 to n_columns + n_columns += 1; + + XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(), + values_.c_str(), n_columns); + + if (row_block_.label) { + XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1); + } + if (row_block_.qid) { + XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1); + } + if (row_block_.weight) { + XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1); + } + // Continue iteration + return true; + } else { + // Stop iteration + return false; + } + } + + auto Proxy() -> decltype(proxy_) { return proxy_; } + + void Reset() { + CHECK(!type_.empty()); + parser_.reset(dmlc::Parser::Create(uri_.c_str(), part_idx_, + n_parts_, type_.c_str())); + } +}; + +namespace fileiter { +inline void Reset(DataIterHandle self) { + static_cast(self)->Reset(); +} + +inline int Next(DataIterHandle self) { + return static_cast(self)->Next(); +} +} // namespace fileiter +} // namespace data +} // namespace xgboost +#endif // XGBOOST_DATA_FILE_ITERATOR_H_ diff --git 
a/src/data/iterative_device_dmatrix.cu b/src/data/iterative_device_dmatrix.cu index 4b584a4e7..00e502dfa 100644 --- a/src/data/iterative_device_dmatrix.cu +++ b/src/data/iterative_device_dmatrix.cu @@ -143,7 +143,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin proxy->Info().num_row_ = num_rows(); proxy->Info().num_col_ = cols; if (batches != 1) { - this->info_.Extend(std::move(proxy->Info()), false); + this->info_.Extend(std::move(proxy->Info()), false, true); } n_batches_for_verification++; } @@ -163,7 +163,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin BatchSet IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) { CHECK(page_); auto begin_iter = - BatchIterator(new SimpleBatchIteratorImpl(page_.get())); + BatchIterator(new SimpleBatchIteratorImpl(page_)); return BatchSet(begin_iter); } } // namespace data diff --git a/src/data/iterative_device_dmatrix.h b/src/data/iterative_device_dmatrix.h index a9923552c..232b50102 100644 --- a/src/data/iterative_device_dmatrix.h +++ b/src/data/iterative_device_dmatrix.h @@ -14,6 +14,7 @@ #include "xgboost/data.h" #include "xgboost/c_api.h" #include "proxy_dmatrix.h" +#include "simple_batch_iterator.h" namespace xgboost { namespace data { @@ -36,9 +37,10 @@ class IterativeDeviceDMatrix : public DMatrix { XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin) : proxy_{proxy}, reset_{reset}, next_{next} { - batch_param_ = BatchParam{0, max_bin, 0}; + batch_param_ = BatchParam{0, max_bin}; this->Initialize(iter, missing, nthread); } + ~IterativeDeviceDMatrix() override = default; bool EllpackExists() const override { return true; } bool SparsePageExists() const override { return false; } @@ -74,6 +76,18 @@ class IterativeDeviceDMatrix : public DMatrix { return info_; } }; + +#if !defined(XGBOOST_USE_CUDA) +inline void IterativeDeviceDMatrix::Initialize(DataIterHandle iter, float missing, int nthread) { + 
common::AssertGPUSupport(); +} +inline BatchSet IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) { + common::AssertGPUSupport(); + auto begin_iter = + BatchIterator(new SimpleBatchIteratorImpl(page_)); + return BatchSet(BatchIterator(begin_iter)); +} +#endif // !defined(XGBOOST_USE_CUDA) } // namespace data } // namespace xgboost diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 4baeec0b3..11d664666 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -1,5 +1,5 @@ /*! - * Copyright 2020 XGBoost contributors + * Copyright 2020-2021 XGBoost contributors */ #ifndef XGBOOST_DATA_PROXY_DMATRIX_H_ #define XGBOOST_DATA_PROXY_DMATRIX_H_ diff --git a/src/data/simple_batch_iterator.h b/src/data/simple_batch_iterator.h index 53464c6fa..62088d24f 100644 --- a/src/data/simple_batch_iterator.h +++ b/src/data/simple_batch_iterator.h @@ -1,10 +1,13 @@ /*! - * Copyright 2019 XGBoost contributors + * Copyright 2019-2021 XGBoost contributors */ #ifndef XGBOOST_DATA_SIMPLE_BATCH_ITERATOR_H_ #define XGBOOST_DATA_SIMPLE_BATCH_ITERATOR_H_ -#include +#include +#include + +#include "xgboost/data.h" namespace xgboost { namespace data { @@ -12,20 +15,21 @@ namespace data { template class SimpleBatchIteratorImpl : public BatchIteratorImpl { public: - explicit SimpleBatchIteratorImpl(T* page) : page_(page) {} - T& operator*() override { - CHECK(page_ != nullptr); - return *page_; - } + explicit SimpleBatchIteratorImpl(std::shared_ptr page) : page_(std::move(page)) {} const T& operator*() const override { CHECK(page_ != nullptr); return *page_; } - void operator++() override { page_ = nullptr; } + SimpleBatchIteratorImpl &operator++() override { + page_ = nullptr; + return *this; + } bool AtEnd() const override { return page_ == nullptr; } + std::shared_ptr Page() const override { return page_; } + private: - T* page_{nullptr}; + std::shared_ptr page_{nullptr}; }; } // namespace data diff --git a/src/data/simple_dmatrix.cc 
b/src/data/simple_dmatrix.cc index bcab52e48..a737c6d59 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -1,5 +1,5 @@ /*! - * Copyright 2014~2020 by Contributors + * Copyright 2014~2021 by Contributors * \file simple_dmatrix.cc * \brief the input data structure for gradient boosting * \author Tianqi Chen @@ -27,7 +27,7 @@ const MetaInfo& SimpleDMatrix::Info() const { return info_; } DMatrix* SimpleDMatrix::Slice(common::Span ridxs) { auto out = new SimpleDMatrix; - SparsePage& out_page = out->sparse_page_; + SparsePage& out_page = *out->sparse_page_; for (auto const &page : this->GetBatches()) { auto batch = page.GetView(); auto& h_data = out_page.data.HostVector(); @@ -48,17 +48,17 @@ DMatrix* SimpleDMatrix::Slice(common::Span ridxs) { BatchSet SimpleDMatrix::GetRowBatches() { // since csr is the default data structure so `source_` is always available. auto begin_iter = BatchIterator( - new SimpleBatchIteratorImpl(&sparse_page_)); + new SimpleBatchIteratorImpl(sparse_page_)); return BatchSet(begin_iter); } BatchSet SimpleDMatrix::GetColumnBatches() { // column page doesn't exist, generate it if (!column_page_) { - column_page_.reset(new CSCPage(sparse_page_.GetTranspose(info_.num_col_))); + column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_))); } auto begin_iter = - BatchIterator(new SimpleBatchIteratorImpl(column_page_.get())); + BatchIterator(new SimpleBatchIteratorImpl(column_page_)); return BatchSet(begin_iter); } @@ -66,11 +66,11 @@ BatchSet SimpleDMatrix::GetSortedColumnBatches() { // Sorted column page doesn't exist, generate it if (!sorted_column_page_) { sorted_column_page_.reset( - new SortedCSCPage(sparse_page_.GetTranspose(info_.num_col_))); + new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_))); sorted_column_page_->SortRows(); } auto begin_iter = BatchIterator( - new SimpleBatchIteratorImpl(sorted_column_page_.get())); + new SimpleBatchIteratorImpl(sorted_column_page_)); return 
BatchSet(begin_iter); } @@ -86,7 +86,7 @@ BatchSet SimpleDMatrix::GetEllpackBatches(const BatchParam& param) batch_param_ = param; } auto begin_iter = - BatchIterator(new SimpleBatchIteratorImpl(ellpack_page_.get())); + BatchIterator(new SimpleBatchIteratorImpl(ellpack_page_)); return BatchSet(begin_iter); } @@ -100,7 +100,7 @@ BatchSet SimpleDMatrix::GetGradientIndex(const BatchParam& par batch_param_ = param; } auto begin_iter = BatchIterator( - new SimpleBatchIteratorImpl(gradient_index_.get())); + new SimpleBatchIteratorImpl(gradient_index_)); return BatchSet(begin_iter); } @@ -110,8 +110,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { uint64_t default_max = std::numeric_limits::max(); uint64_t last_group_id = default_max; bst_uint group_size = 0; - auto& offset_vec = sparse_page_.offset.HostVector(); - auto& data_vec = sparse_page_.data.HostVector(); + auto& offset_vec = sparse_page_->offset.HostVector(); + auto& data_vec = sparse_page_->data.HostVector(); uint64_t inferred_num_columns = 0; uint64_t total_batch_size = 0; // batch_size is either number of rows or cols, depending on data layout @@ -120,7 +120,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { // Iterate over batches of input data while (adapter->Next()) { auto& batch = adapter->Value(); - auto batch_max_columns = sparse_page_.Push(batch, missing, nthread); + auto batch_max_columns = sparse_page_->Push(batch, missing, nthread); inferred_num_columns = std::max(batch_max_columns, inferred_num_columns); total_batch_size += batch.Size(); // Append meta information if available @@ -203,8 +203,8 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) { CHECK(in_stream->Read(&tmagic)) << "invalid input file format"; CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch"; info_.LoadBinary(in_stream); - in_stream->Read(&sparse_page_.offset.HostVector()); - in_stream->Read(&sparse_page_.data.HostVector()); + 
in_stream->Read(&sparse_page_->offset.HostVector()); + in_stream->Read(&sparse_page_->data.HostVector()); } void SimpleDMatrix::SaveToLocalFile(const std::string& fname) { @@ -212,8 +212,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) { int tmagic = kMagic; fo->Write(tmagic); info_.SaveBinary(fo.get()); - fo->Write(sparse_page_.offset.HostVector()); - fo->Write(sparse_page_.data.HostVector()); + fo->Write(sparse_page_->offset.HostVector()); + fo->Write(sparse_page_->data.HostVector()); } template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index 87f7fa2a0..80de6706c 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -28,7 +28,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { CHECK(!adapter->Next()); info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), adapter->DeviceIdx(), - missing, &sparse_page_); + missing, sparse_page_.get()); info_.num_col_ = adapter->NumColumns(); info_.num_row_ = adapter->NumRows(); // Synchronise worker columns diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h index aa555b212..3529f1de4 100644 --- a/src/data/simple_dmatrix.h +++ b/src/data/simple_dmatrix.h @@ -1,5 +1,5 @@ /*! - * Copyright 2015 by Contributors + * Copyright 2015-2021 by Contributors * \file simple_dmatrix.h * \brief In-memory version of DMatrix. 
* \author Tianqi Chen @@ -47,11 +47,12 @@ class SimpleDMatrix : public DMatrix { BatchSet GetGradientIndex(const BatchParam& param) override; MetaInfo info_; - SparsePage sparse_page_; // Primary storage type - std::unique_ptr column_page_; - std::unique_ptr sorted_column_page_; - std::unique_ptr ellpack_page_; - std::unique_ptr gradient_index_; + // Primary storage type + std::shared_ptr sparse_page_ = std::make_shared(); + std::shared_ptr column_page_; + std::shared_ptr sorted_column_page_; + std::shared_ptr ellpack_page_; + std::shared_ptr gradient_index_; BatchParam batch_param_; bool EllpackExists() const override { diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 4d903f87d..d6e26195b 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -1,59 +1,147 @@ /*! - * Copyright 2014-2020 by Contributors + * Copyright 2014-2021 by Contributors * \file sparse_page_dmatrix.cc * \brief The external memory version of Page Iterator. * \author Tianqi Chen */ -#include -#include - -#if DMLC_ENABLE_STD_THREAD #include "./sparse_page_dmatrix.h" - #include "./simple_batch_iterator.h" +#include "gradient_index.h" namespace xgboost { namespace data { -MetaInfo& SparsePageDMatrix::Info() { - return row_source_->info; +MetaInfo &SparsePageDMatrix::Info() { return info_; } + +const MetaInfo &SparsePageDMatrix::Info() const { return info_; } + +SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy_handle, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, float missing, + int32_t nthreads, std::string cache_prefix) + : proxy_{proxy_handle}, iter_{iter_handle}, reset_{reset}, next_{next}, missing_{missing}, + nthreads_{nthreads}, cache_prefix_{std::move(cache_prefix)} { + cache_prefix_ = cache_prefix_.empty() ? 
"DMatrix" : cache_prefix_; + if (rabit::IsDistributed()) { + cache_prefix_ += ("-r" + std::to_string(rabit::GetRank())); + } + DMatrixProxy *proxy = MakeProxy(proxy_); + auto iter = DataIterProxy{ + iter_, reset_, next_}; + + uint32_t n_batches = 0; + size_t n_features = 0; + size_t n_samples = 0; + size_t nnz = 0; + + auto num_rows = [&]() { + return HostAdapterDispatch( + proxy, [](auto const &value) { return value.NumRows(); }); + }; + auto num_cols = [&]() { + return HostAdapterDispatch( + proxy, [](auto const &value) { return value.NumCols(); }); + }; + // the proxy is iterated together with the sparse page source so we can obtain all + // information in 1 pass. + for (auto const &page : this->GetRowBatchesImpl()) { + this->info_.Extend(std::move(proxy->Info()), false, false); + n_features = std::max(n_features, num_cols()); + n_samples += num_rows(); + nnz += page.data.Size(); + n_batches++; + } + + iter.Reset(); + + this->n_batches_ = n_batches; + this->info_.num_row_ = n_samples; + this->info_.num_col_ = n_features; + this->info_.num_nonzero_ = nnz; + + rabit::Allreduce(&info_.num_col_, 1); + CHECK_NE(info_.num_col_, 0); } -const MetaInfo& SparsePageDMatrix::Info() const { - return row_source_->info; +void SparsePageDMatrix::InitializeSparsePage() { + auto id = MakeCache(this, ".row.page", cache_prefix_, &cache_info_); + // Don't use proxy DMatrix once this is already initialized, this allows users to + // release the iterator and data. + if (cache_info_.at(id)->written) { + CHECK(sparse_page_source_); + sparse_page_source_->Reset(); + return; + } + + auto iter = DataIterProxy{ + iter_, reset_, next_}; + DMatrixProxy *proxy = MakeProxy(proxy_); + sparse_page_source_.reset(); // clear before creating new one to prevent conflicts. 
+ sparse_page_source_ = std::make_shared( + iter, proxy, this->missing_, this->nthreads_, this->info_.num_col_, + this->n_batches_, cache_info_.at(id)); +} + +BatchSet SparsePageDMatrix::GetRowBatchesImpl() { + this->InitializeSparsePage(); + auto begin_iter = BatchIterator(sparse_page_source_); + return BatchSet(BatchIterator(begin_iter)); } BatchSet SparsePageDMatrix::GetRowBatches() { - return row_source_->GetBatchSet(); + return this->GetRowBatchesImpl(); } BatchSet SparsePageDMatrix::GetColumnBatches() { - // Lazily instantiate + auto id = MakeCache(this, ".col.page", cache_prefix_, &cache_info_); + CHECK_NE(this->Info().num_col_, 0); + this->InitializeSparsePage(); if (!column_source_) { - column_source_.reset(new CSCPageSource(this, cache_info_)); + column_source_ = std::make_shared( + this->missing_, this->nthreads_, this->Info().num_col_, + this->n_batches_, cache_info_.at(id), sparse_page_source_); + } else { + column_source_->Reset(); } - return column_source_->GetBatchSet(); + auto begin_iter = BatchIterator(column_source_); + return BatchSet(BatchIterator(begin_iter)); } BatchSet SparsePageDMatrix::GetSortedColumnBatches() { - // Lazily instantiate + auto id = MakeCache(this, ".sorted.col.page", cache_prefix_, &cache_info_); + CHECK_NE(this->Info().num_col_, 0); + this->InitializeSparsePage(); if (!sorted_column_source_) { - sorted_column_source_.reset(new SortedCSCPageSource(this, cache_info_)); + sorted_column_source_ = std::make_shared( + this->missing_, this->nthreads_, this->Info().num_col_, + this->n_batches_, cache_info_.at(id), sparse_page_source_); + } else { + sorted_column_source_->Reset(); } - return sorted_column_source_->GetBatchSet(); + auto begin_iter = BatchIterator(sorted_column_source_); + return BatchSet(BatchIterator(begin_iter)); } -BatchSet SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) { - CHECK_GE(param.gpu_id, 0); +BatchSet SparsePageDMatrix::GetGradientIndex(const BatchParam& param) { CHECK_GE(param.max_bin, 
2); - // Lazily instantiate - if (!ellpack_source_ || (batch_param_ != param && param != BatchParam{})) { - ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param)); + // External memory is not supported + if (!ghist_index_source_ || (param != batch_param_ && param != BatchParam{})) { + this->InitializeSparsePage(); + ghist_index_source_.reset(new GHistIndexMatrix{this, param.max_bin}); batch_param_ = param; } - return ellpack_source_->GetBatchSet(); + this->InitializeSparsePage(); + auto begin_iter = BatchIterator( + new SimpleBatchIteratorImpl(ghist_index_source_)); + return BatchSet(begin_iter); } +#if !defined(XGBOOST_USE_CUDA) +BatchSet SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) { + common::AssertGPUSupport(); + auto begin_iter = BatchIterator(ellpack_page_source_); + return BatchSet(BatchIterator(begin_iter)); +} +#endif // !defined(XGBOOST_USE_CUDA) } // namespace data } // namespace xgboost -#endif // DMLC_ENABLE_STD_THREAD diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu new file mode 100644 index 000000000..176cdc75b --- /dev/null +++ b/src/data/sparse_page_dmatrix.cu @@ -0,0 +1,46 @@ +/*! 
+ * Copyright 2021 XGBoost contributors + */ +#include "sparse_page_source.h" +#include "../common/hist_util.cuh" +#include "ellpack_page.cuh" +#include "sparse_page_dmatrix.h" + +namespace xgboost { +namespace data { +BatchSet SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) { + CHECK_GE(param.gpu_id, 0); + CHECK_GE(param.max_bin, 2); + auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_); + size_t row_stride = 0; + this->InitializeSparsePage(); + if (!cache_info_.at(id)->written || (batch_param_ != param && param != BatchParam{})) { + // reinitialize the cache + cache_info_.erase(id); + MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_); + std::unique_ptr cuts; + cuts.reset(new common::HistogramCuts{ + common::DeviceSketch(param.gpu_id, this, param.max_bin, 0)}); + this->InitializeSparsePage(); // reset after use. + + row_stride = GetRowStride(this); + this->InitializeSparsePage(); // reset after use. + CHECK_NE(row_stride, 0); + batch_param_ = param; + + auto ft = this->info_.feature_types.ConstDeviceSpan(); + ellpack_page_source_.reset(); // release resources. + ellpack_page_source_.reset(new EllpackPageSource( + this->missing_, this->nthreads_, this->Info().num_col_, + this->n_batches_, cache_info_.at(id), param, std::move(cuts), + this->IsDense(), row_stride, ft, sparse_page_source_)); + } else { + CHECK(sparse_page_source_); + ellpack_page_source_->Reset(); + } + + auto begin_iter = BatchIterator(ellpack_page_source_); + return BatchSet(BatchIterator(begin_iter)); +} +} // namespace data +} // namespace xgboost diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index a4eaeed71..3164a3ee3 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -1,5 +1,5 @@ /*! - * Copyright 2015 by Contributors + * Copyright 2015-2021 by Contributors * \file sparse_page_dmatrix.h * \brief External-memory version of DMatrix. 
* \author Tianqi Chen @@ -13,24 +13,88 @@ #include #include #include +#include #include "ellpack_page_source.h" #include "sparse_page_source.h" namespace xgboost { namespace data { -// Used for external memory. +/** + * \brief DMatrix used for external memory. + * + * The external memory is created for controlling memory usage by splitting up data into + * multiple batches. However that doesn't mean we will actually process exactly 1 batch at + * a time, which would be terribly slow considering that we have to loop through the + * whole dataset for every tree split. So we use async pre-fetch and let the caller decide + * how many batches it wants to process by returning data as a shared pointer. The caller + * can use an async function to process the data or just stage those batches; making that + * decision is out of scope for the sparse page dmatrix. These 2 optimizations might + * defeat the purpose of splitting up dataset since if you load all the batches then the + * memory usage is even worse than using a single batch. Essentially we need to control + * how many batches can be in memory at the same time. + * + * Right now the write to the cache is a sequential operation and is blocking, reading from + * cache is async but with a hard coded limit of 4 pages as a heuristic. So by sparse + * dmatrix itself there can be only 9 pages in main memory (might be of different types) + * at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched + * dependent pages. If the caller stops iteration at the middle and starts again, then the + * number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in + * caller's code (XGBoost doesn't discard a large portion of data at the end, there's no 
+ * sampling algo that samples only the first portion of data). + * + * Of course if the caller decides to retain some batches to perform parallel processing, + * then we might load all pages in memory, which is also considered as a bug in caller's + * code. So if the algo supports external memory, it must make sure that the queue for async + * calls has an upper limit. + * + * Another assumption we make is that the data must be immutable so caller should never + * change the data. Sparse page source returns const page to make sure of that. If you + * want to change the generated page like Ellpack, pass parameter into `GetBatches` to + * re-generate them instead of trying to modify the pages in-place. + * + * A possible optimization is dropping the sparse page once dependent pages like ellpack + * are constructed and cached. + */ class SparsePageDMatrix : public DMatrix { + MetaInfo info_; + BatchParam batch_param_; + std::map> cache_info_; + + DMatrixHandle proxy_; + DataIterHandle iter_; + DataIterResetCallback *reset_; + XGDMatrixCallbackNext *next_; + + float missing_; + int nthreads_; + std::string cache_prefix_; + uint32_t n_batches_ {0}; + // sparse page is the source to other page types, we make a special member function. + void InitializeSparsePage(); + // Non-virtual version that can be used in constructor + BatchSet GetRowBatchesImpl(); + public: - template - explicit SparsePageDMatrix(AdapterT* adapter, float missing, int nthread, - const std::string& cache_prefix, - size_t page_size = kPageSize) - : cache_info_(std::move(cache_prefix)) { - row_source_.reset(new data::SparsePageSource(adapter, missing, nthread, - cache_prefix, page_size)); + explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, float missing, + int32_t nthreads, std::string cache_prefix); + + ~SparsePageDMatrix() override { + // Clear out all resources before deleting the cache file. 
+ sparse_page_source_.reset(); + ellpack_page_source_.reset(); + column_source_.reset(); + sorted_column_source_.reset(); + ghist_index_source_.reset(); + + for (auto const &kv : cache_info_) { + CHECK(kv.second); + auto n = kv.second->ShardName(); + TryDeleteCacheFile(n); + } } - ~SparsePageDMatrix() override = default; MetaInfo& Info() override; @@ -47,30 +111,41 @@ class SparsePageDMatrix : public DMatrix { BatchSet GetColumnBatches() override; BatchSet GetSortedColumnBatches() override; BatchSet GetEllpackBatches(const BatchParam& param) override; - BatchSet GetGradientIndex(const BatchParam&) override { - LOG(FATAL) << "Not implemented."; - return BatchSet(BatchIterator(nullptr)); - } + BatchSet GetGradientIndex(const BatchParam&) override; // source data pointers. - std::unique_ptr row_source_; - std::unique_ptr column_source_; - std::unique_ptr sorted_column_source_; - std::unique_ptr ellpack_source_; - // saved batch param - BatchParam batch_param_; - // the cache prefix - std::string cache_info_; - // Store column densities to avoid recalculating - std::vector col_density_; + std::shared_ptr sparse_page_source_; + std::shared_ptr ellpack_page_source_; + std::shared_ptr column_source_; + std::shared_ptr sorted_column_source_; + std::shared_ptr ghist_index_source_; bool EllpackExists() const override { - return static_cast(ellpack_source_); + return static_cast(ellpack_page_source_); } bool SparsePageExists() const override { - return static_cast(row_source_); + return static_cast(sparse_page_source_); } }; + +inline std::string MakeId(std::string prefix, SparsePageDMatrix *ptr) { + std::stringstream ss; + ss << ptr; + return prefix + "-" + ss.str(); +} + +inline std::string +MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix, + std::map> *out) { + auto &cache_info = *out; + auto name = MakeId(prefix, ptr); + auto id = name + format; + auto it = cache_info.find(id); + if (it == cache_info.cend()) { + cache_info[id].reset(new 
Cache{false, name, format}); + } + return id; +} } // namespace data } // namespace xgboost #endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc deleted file mode 100644 index 18376a18e..000000000 --- a/src/data/sparse_page_source.cc +++ /dev/null @@ -1,77 +0,0 @@ -/*! - * Copyright (c) 2020 by XGBoost Contributors - */ -#include "sparse_page_source.h" - -namespace xgboost { -namespace data { -void DataPool::Slice(std::shared_ptr out, size_t offset, - size_t n_rows, size_t entry_offset) const { - auto const &in_offset = pool_.offset.HostVector(); - auto const &in_data = pool_.data.HostVector(); - auto &h_offset = out->offset.HostVector(); - CHECK_LE(offset + n_rows + 1, in_offset.size()); - h_offset.resize(n_rows + 1, 0); - std::transform(in_offset.cbegin() + offset, - in_offset.cbegin() + offset + n_rows + 1, h_offset.begin(), - [=](size_t ptr) { return ptr - entry_offset; }); - - auto &h_data = out->data.HostVector(); - CHECK_GT(h_offset.size(), 0); - size_t n_entries = h_offset.back(); - h_data.resize(n_entries); - - CHECK_EQ(n_entries, in_offset.at(offset + n_rows) - in_offset.at(offset)); - std::copy_n(in_data.cbegin() + in_offset.at(offset), n_entries, - h_data.begin()); -} - -void DataPool::SplitWritePage() { - size_t total = pool_.Size(); - size_t offset = 0; - size_t entry_offset = 0; - do { - size_t n_rows = std::min(page_size_, total - offset); - std::shared_ptr out; - writer_->Alloc(&out); - out->Clear(); - out->SetBaseRowId(inferred_num_rows_); - this->Slice(out, offset, n_rows, entry_offset); - inferred_num_rows_ += out->Size(); - offset += n_rows; - entry_offset += out->data.Size(); - CHECK_NE(out->Size(), 0); - writer_->PushWrite(std::move(out)); - } while (total - offset >= page_size_); - - if (total - offset != 0) { - auto out = std::make_shared(); - this->Slice(out, offset, total - offset, entry_offset); - CHECK_NE(out->Size(), 0); - pool_.Clear(); - pool_.Push(*out); - } else 
{ - pool_.Clear(); - } -} -size_t DataPool::Finalize() { - inferred_num_rows_ += pool_.Size(); - if (pool_.Size() != 0) { - std::shared_ptr page; - this->writer_->Alloc(&page); - page->Clear(); - page->Push(pool_); - this->writer_->PushWrite(std::move(page)); - } - - if (inferred_num_rows_ == 0) { - std::shared_ptr page; - this->writer_->Alloc(&page); - page->Clear(); - this->writer_->PushWrite(std::move(page)); - } - - return inferred_num_rows_; -} -} // namespace data -} // namespace xgboost diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu new file mode 100644 index 000000000..8c292ded6 --- /dev/null +++ b/src/data/sparse_page_source.cu @@ -0,0 +1,17 @@ +/*! + * Copyright 2021 XGBoost contributors + */ +#include "sparse_page_source.h" +#include "proxy_dmatrix.cuh" +#include "simple_dmatrix.cuh" + +namespace xgboost { +namespace data { +void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) { + auto device = proxy->DeviceIdx(); + Dispatch(proxy, [&](auto const &value) { + CopyToSparsePage(value, device, missing, page); + }); +} +} // namespace data +} // namespace xgboost diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index d36c6b07e..2b634e7aa 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -1,54 +1,18 @@ /*! - * Copyright (c) 2014-2019 by Contributors - * \file page_csr_source.h - * External memory data source, saved with sparse_batch_page binary format. - * \author Tianqi Chen - * - * ------------------------------------------------- - * Random notes on implementation of external memory - * ------------------------------------------------- - * - * As of XGBoost 1.3, the general pipeline is: - * - * dmlc text file parser --> file adapter --> sparse page source -> data pool --> - * write to binary cache --> load it back ~~> [ other pages (csc, ellpack, sorted csc) --> - * write to binary cache ] --> use it in various algorithms. 
- * - * ~~> means optional - * - * The dmlc text file parser returns number of blocks based on available threads, which - * can make the data partitioning non-deterministic, so here we set up an extra data pool - * to stage parsed data. As a result, the number of blocks returned by text parser does - * not equal to number of blocks in binary cache. - * - * Binary cache loading is async by the dmlc threaded iterator, which helps performance, - * but as this iterator itself is not thread safe, so calling - * `dmatrix->GetBatches` is also not thread safe. Please note that, the - * threaded iterator is also used inside dmlc text file parser. - * - * Memory consumption is difficult to control due to various reasons. Firstly the text - * parsing doesn't have a batch size, only a hard coded buffer size is available. - * Secondly, everything is loaded/written with async queue, with multiple queues running - * the memory consumption is difficult to measure. - * - * The threaded iterator relies heavily on C++ memory model and threading primitive. The - * concurrent writer for binary cache is an old copy of moody queue. We should try to - * replace them with something more robust. + * Copyright (c) 2014-2021 by Contributors + * \file sparse_page_source.h */ #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ -#include -#include - -#include -#include -#include -#include +#include // std::min #include #include #include -#include +#include +#include +#include +#include #include "rabit/rabit.h" #include "xgboost/base.h" @@ -56,93 +20,12 @@ #include "adapter.h" #include "sparse_page_writer.h" +#include "proxy_dmatrix.h" + #include "../common/common.h" -#include - -namespace detail { - -// Split a cache info string with delimiter ':' -// If cache info string contains drive letter (e.g. 
C:), exclude it before splitting -inline std::vector -GetCacheShards(const std::string& cache_info) { -#if (defined _WIN32) || (defined __CYGWIN__) - if (cache_info.length() >= 2 - && std::isalpha(cache_info[0], std::locale::classic()) - && cache_info[1] == ':') { - std::vector cache_shards - = xgboost::common::Split(cache_info.substr(2), ':'); - cache_shards[0] = cache_info.substr(0, 2) + cache_shards[0]; - return cache_shards; - } -#endif // (defined _WIN32) || (defined __CYGWIN__) - return xgboost::common::Split(cache_info, ':'); -} - -} // namespace detail namespace xgboost { namespace data { - -template -class SparseBatchIteratorImpl : public BatchIteratorImpl { - public: - explicit SparseBatchIteratorImpl(S* source) : source_(source) { - CHECK(source_ != nullptr); - source_->BeforeFirst(); - source_->Next(); - } - T& operator*() override { return source_->Value(); } - const T& operator*() const override { return source_->Value(); } - void operator++() override { at_end_ = !source_->Next(); } - bool AtEnd() const override { return at_end_; } - - private: - S* source_{nullptr}; - bool at_end_{ false }; -}; - - /*! \brief magic number used to identify Page */ - static const int kMagic = 0xffffab02; -/*! - * \brief decide the format from cache prefix. - * \return pair of row format, column format type of the cache prefix. 
- */ -inline std::pair DecideFormat(const std::string& cache_prefix) { - size_t pos = cache_prefix.rfind(".fmt-"); - - if (pos != std::string::npos) { - std::string fmt = cache_prefix.substr(pos + 5, cache_prefix.length()); - size_t cpos = fmt.rfind('-'); - if (cpos != std::string::npos) { - return std::make_pair(fmt.substr(0, cpos), fmt.substr(cpos + 1, fmt.length())); - } else { - return std::make_pair(fmt, fmt); - } - } else { - std::string raw = "raw"; - return std::make_pair(raw, raw); - } -} - -struct CacheInfo { - std::string name_info; - std::vector format_shards; - std::vector name_shards; -}; - -inline CacheInfo ParseCacheInfo(const std::string& cache_info, const std::string& page_type) { - CacheInfo info; - std::vector cache_shards = ::detail::GetCacheShards(cache_info); - CHECK_NE(cache_shards.size(), 0U); - // read in the info files. - info.name_info = cache_shards[0]; - for (const std::string& prefix : cache_shards) { - info.name_shards.push_back(prefix + page_type); - info.format_shards.push_back(DecideFormat(prefix).first); - } - return info; -} - inline void TryDeleteCacheFile(const std::string& file) { if (std::remove(file.c_str()) != 0) { LOG(WARNING) << "Couldn't remove external memory cache file " << file @@ -150,415 +33,327 @@ inline void TryDeleteCacheFile(const std::string& file) { } } -inline void CheckCacheFileExists(const std::string& file) { - std::ifstream f(file.c_str()); - if (f.good()) { - LOG(FATAL) - << "Cache file " << file << " exists already; " - << "Is there another DMatrix with the same " - "cache prefix? It can be caused by previously used DMatrix that " - "hasn't been collected by language environment garbage collector. " - "Otherwise please remove it manually."; - } -} +struct Cache { + // whether the write to the cache is complete + bool written; + std::string name; + std::string format; + // offset into binary cache file. 
+ std::vector offset; + + Cache(bool w, std::string n, std::string fmt) + : written{w}, name{std::move(n)}, format{std::move(fmt)} { + offset.push_back(0); + } + + static std::string ShardName(std::string name, std::string format) { + CHECK_EQ(format.front(), '.'); + return name + format; + } + + std::string ShardName() { + return ShardName(this->name, this->format); + } + + // The write is completed. + void Commit() { + if (!written) { + std::partial_sum(offset.begin(), offset.end(), offset.begin()); + written = true; + } + } +}; + +// Prevents multi-threaded call. +class TryLockGuard { + std::mutex& lock_; -/** - * \brief Given a set of cache files and page type, this object iterates over batches - * using prefetching for improved performance. Not thread safe. - * - * \tparam PageT Type of the page t. - */ -template -class ExternalMemoryPrefetcher : dmlc::DataIter { public: - explicit ExternalMemoryPrefetcher(const CacheInfo& info) noexcept(false) - : base_rowid_(0), page_(nullptr), clock_ptr_(0) { - // read in the info files - CHECK_NE(info.name_shards.size(), 0U); - { - std::unique_ptr finfo( - dmlc::Stream::Create(info.name_info.c_str(), "r")); - int tmagic; - CHECK(finfo->Read(&tmagic)); - CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch"; - } - files_.resize(info.name_shards.size()); - formats_.resize(info.name_shards.size()); - prefetchers_.resize(info.name_shards.size()); - - // read in the cache files. 
- for (size_t i = 0; i < info.name_shards.size(); ++i) { - std::string name_row = info.name_shards.at(i); - files_[i].reset(dmlc::SeekStream::CreateForRead(name_row.c_str())); - std::unique_ptr& fi = files_[i]; - std::string format; - CHECK(fi->Read(&format)) << "Invalid page format"; - formats_[i].reset(CreatePageFormat(format)); - std::unique_ptr>& fmt = formats_[i]; - size_t fbegin = fi->Tell(); - prefetchers_[i].reset(new dmlc::ThreadedIter(4)); - prefetchers_[i]->Init( - [&fi, &fmt](PageT** dptr) { - if (*dptr == nullptr) { - *dptr = new PageT(); - } - return fmt->Read(*dptr, fi.get()); - }, - [&fi, fbegin]() { fi->Seek(fbegin); }); - } + explicit TryLockGuard(std::mutex& lock) : lock_{lock} { // NOLINT + CHECK(lock_.try_lock()) << "Multiple threads attempting to use Sparse DMatrix."; } - /*! \brief destructor */ - ~ExternalMemoryPrefetcher() override { - delete page_; + ~TryLockGuard() { + lock_.unlock(); } +}; - // implement Next - bool Next() override { - CHECK(mutex_.try_lock()) << "Multiple threads attempting to use prefetcher"; - // doing clock rotation over shards. - if (page_ != nullptr) { - size_t n = prefetchers_.size(); - prefetchers_[(clock_ptr_ + n - 1) % n]->Recycle(&page_); - } +template +class SparsePageSourceImpl : public BatchIteratorImpl { + protected: + // Prevents calling this iterator from multiple places(or threads). + std::mutex single_threaded_; - if (prefetchers_[clock_ptr_]->Next(&page_)) { - page_->SetBaseRowId(base_rowid_); - base_rowid_ += page_->Size(); - // advance clock - clock_ptr_ = (clock_ptr_ + 1) % prefetchers_.size(); - mutex_.unlock(); - return true; - } else { - mutex_.unlock(); + std::shared_ptr page_; + + bool at_end_ {false}; + float missing_; + int nthreads_; + bst_feature_t n_features_; + + uint32_t count_{0}; + + uint32_t n_batches_ {0}; + + std::shared_ptr cache_info_; + std::unique_ptr fo_; + + using Ring = std::vector>>; + // A ring storing futures to data. 
Since the DMatrix iterator is forward only, so we + // can pre-fetch data in a ring. + std::unique_ptr ring_{new Ring}; + + bool ReadCache() { + CHECK(!at_end_); + if (!cache_info_->written) { return false; } - } - - // implement BeforeFirst - void BeforeFirst() override { - CHECK(mutex_.try_lock()) << "Multiple threads attempting to use prefetcher"; - base_rowid_ = 0; - clock_ptr_ = 0; - for (auto& p : prefetchers_) { - p->BeforeFirst(); + if (fo_) { + fo_.reset(); // flush the data to disk. + ring_->resize(n_batches_); } - mutex_.unlock(); + // An heuristic for number of pre-fetched batches. We can make it part of BatchParam + // to let user adjust number of pre-fetched batches when needed. + uint32_t constexpr kPreFetch = 4; + + size_t n_prefetch_batches = std::min(kPreFetch, n_batches_); + CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_; + size_t fetch_it = count_; + for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { + fetch_it %= n_batches_; // ring + if (ring_->at(fetch_it).valid()) { continue; } + auto const *self = this; // make sure it's const + CHECK_LT(fetch_it, cache_info_->offset.size()); + ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() { + std::unique_ptr> fmt{CreatePageFormat("raw")}; + auto n = self->cache_info_->ShardName(); + size_t offset = self->cache_info_->offset.at(fetch_it); + std::unique_ptr fi{ + dmlc::SeekStream::CreateForRead(n.c_str())}; + fi->Seek(offset); + CHECK_EQ(fi->Tell(), offset); + auto page = std::make_shared(); + CHECK(fmt->Read(page.get(), fi.get())); + return page; + }); + } + CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), + [](auto const &f) { return f.valid(); }), + n_prefetch_batches) + << "Sparse DMatrix assumes forward iteration."; + page_ = (*ring_)[count_].get(); + return true; } - // implement Value - PageT& Value() { return *page_; } + void WriteCache() { + CHECK(!cache_info_->written); + std::unique_ptr> fmt{CreatePageFormat("raw")}; + if (!fo_) { + 
auto n = cache_info_->ShardName(); + fo_.reset(dmlc::Stream::Create(n.c_str(), "w")); + } + auto bytes = fmt->Write(*page_, fo_.get()); + cache_info_->offset.push_back(bytes); + } - const PageT& Value() const override { return *page_; } - - private: - std::mutex mutex_; - /*! \brief number of rows */ - size_t base_rowid_; - /*! \brief page currently on hold. */ - PageT* page_; - /*! \brief internal clock ptr */ - size_t clock_ptr_; - /*! \brief file pointer to the row blob file. */ - std::vector> files_; - /*! \brief Sparse page format file. */ - std::vector>> formats_; - /*! \brief internal prefetcher. */ - std::vector>> prefetchers_; -}; - - -// A data pool to keep the size of each page balanced and data partitioning to be -// deterministic. -class DataPool { - size_t inferred_num_rows_; - MetaInfo* info_; - SparsePage pool_; - size_t page_size_; - SparsePageWriter *writer_; - - void Slice(std::shared_ptr out, size_t offset, size_t n_rows, - size_t entry_offset) const; - void SplitWritePage(); + virtual void Fetch() = 0; public: - DataPool(MetaInfo *info, size_t page_size, - SparsePageWriter *writer) - : inferred_num_rows_{0}, info_{info}, - page_size_{page_size}, writer_{writer} {} + SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, + uint32_t n_batches, std::shared_ptr cache) + : missing_{missing}, nthreads_{nthreads}, n_features_{n_features}, + n_batches_{n_batches}, cache_info_{std::move(cache)} {} - void Push(std::shared_ptr page) { - info_->num_nonzero_ += page->data.Size(); - pool_.Push(*page); - if (pool_.Size() > page_size_) { - this->SplitWritePage(); + SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete; + + ~SparsePageSourceImpl() override { + for (auto& fu : *ring_) { + if (fu.valid()) { + fu.get(); + } } - page->Clear(); } - size_t Finalize(); + uint32_t Iter() const { return count_; } + + const S &operator*() const override { + CHECK(page_); + return *page_; + } + + std::shared_ptr Page() const override { + 
return page_; + } + + bool AtEnd() const override { + return at_end_; + } + + virtual void Reset() { + TryLockGuard guard{single_threaded_}; + at_end_ = false; + count_ = 0; + this->Fetch(); + } }; -class SparsePageSource { +#if defined(XGBOOST_USE_CUDA) +void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page); +#else +inline void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) { + common::AssertGPUSupport(); +} +#endif + +class SparsePageSource : public SparsePageSourceImpl { + DataIterProxy iter_; + DMatrixProxy* proxy_; + size_t base_row_id_ {0}; + + void Fetch() final { + page_ = std::make_shared(); + if (!this->ReadCache()) { + bool type_error { false }; + CHECK(proxy_); + HostAdapterDispatch(proxy_, [&](auto const &adapter_batch) { + page_->Push(adapter_batch, this->missing_, this->nthreads_); + }, &type_error); + if (type_error) { + DevicePush(proxy_, missing_, page_.get()); + } + page_->SetBaseRowId(base_row_id_); + base_row_id_ += page_->Size(); + n_batches_++; + this->WriteCache(); + } + } + public: - template - SparsePageSource(AdapterT* adapter, float missing, int nthread, - const std::string& cache_info, - const size_t page_size = DMatrix::kPageSize) { - const std::string page_type = ".row.page"; - cache_info_ = ParseCacheInfo(cache_info, page_type); - - // Warn user if old cache files - CheckCacheFileExists(cache_info_.name_info); - for (auto file : cache_info_.name_shards) { - CheckCacheFileExists(file); + SparsePageSource( + DataIterProxy iter, + DMatrixProxy *proxy, float missing, int nthreads, + bst_feature_t n_features, uint32_t n_batches, std::shared_ptr cache) + : SparsePageSourceImpl(missing, nthreads, n_features, n_batches, cache), + iter_{iter}, proxy_{proxy} { + if (!cache_info_->written) { + iter_.Reset(); + iter_.Next(); } - - { - SparsePageWriter writer(cache_info_.name_shards, - cache_info_.format_shards, 6); - DataPool pool(&info, page_size, &writer); - - std::shared_ptr page { new SparsePage }; - - 
uint64_t inferred_num_columns = 0; - uint64_t inferred_num_rows = 0; - - const uint64_t default_max = std::numeric_limits::max(); - uint64_t last_group_id = default_max; - bst_uint group_size = 0; - std::vector qids; - adapter->BeforeFirst(); - while (adapter->Next()) { - auto& batch = adapter->Value(); - if (batch.Labels() != nullptr) { - auto& labels = info.labels_.HostVector(); - labels.insert(labels.end(), batch.Labels(), - batch.Labels() + batch.Size()); - } - if (batch.Weights() != nullptr) { - auto& weights = info.weights_.HostVector(); - weights.insert(weights.end(), batch.Weights(), - batch.Weights() + batch.Size()); - } - if (batch.BaseMargin() != nullptr) { - auto& base_margin = info.base_margin_.HostVector(); - base_margin.insert(base_margin.end(), batch.BaseMargin(), - batch.BaseMargin() + batch.Size()); - } - if (batch.Qid() != nullptr) { - qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); - // get group - for (size_t i = 0; i < batch.Size(); ++i) { - const uint64_t cur_group_id = batch.Qid()[i]; - if (last_group_id == default_max || - last_group_id != cur_group_id) { - info.group_ptr_.push_back(group_size); - } - last_group_id = cur_group_id; - ++group_size; - } - } - CHECK_EQ(page->Size(), 0); - auto batch_max_columns = page->Push(batch, missing, nthread); - inferred_num_columns = - std::max(batch_max_columns, inferred_num_columns); - inferred_num_rows += page->Size(); - pool.Push(page); - page->SetBaseRowId(inferred_num_rows); - } - - if (last_group_id != default_max) { - if (group_size > info.group_ptr_.back()) { - info.group_ptr_.push_back(group_size); - } - } - - // Deal with empty rows/columns if necessary - if (adapter->NumColumns() == kAdapterUnknownSize) { - info.num_col_ = inferred_num_columns; - } else { - info.num_col_ = adapter->NumColumns(); - } - // Synchronise worker columns - rabit::Allreduce(&info.num_col_, 1); - - if (adapter->NumRows() == kAdapterUnknownSize) { - info.num_row_ = inferred_num_rows; - } else { - if 
(page->offset.HostVector().empty()) { - page->offset.HostVector().emplace_back(0); - } - - while (inferred_num_rows < adapter->NumRows()) { - page->offset.HostVector().emplace_back( - page->offset.HostVector().back()); - inferred_num_rows++; - } - info.num_row_ = adapter->NumRows(); - } - - pool.Push(page); - pool.Finalize(); - - std::unique_ptr fo( - dmlc::Stream::Create(cache_info_.name_info.c_str(), "w")); - int tmagic = kMagic; - fo->Write(tmagic); - // Either every row has query ID or none at all - CHECK(qids.empty() || qids.size() == info.num_row_); - info.SaveBinary(fo.get()); - } - LOG(INFO) << "SparsePageSource Finished writing to " - << cache_info_.name_info; - - external_prefetcher_.reset( - new ExternalMemoryPrefetcher(cache_info_)); + this->Fetch(); } - ~SparsePageSource() { - external_prefetcher_.reset(); - TryDeleteCacheFile(cache_info_.name_info); - for (auto file : cache_info_.name_shards) { - TryDeleteCacheFile(file); + SparsePageSource& operator++() final { + TryLockGuard guard{single_threaded_}; + count_++; + if (cache_info_->written) { + at_end_ = (count_ == n_batches_); + } else { + at_end_ = !iter_.Next(); } + + if (at_end_) { + cache_info_->Commit(); + if (n_batches_ != 0) { + CHECK_EQ(count_, n_batches_); + } + CHECK_GE(count_, 1); + proxy_ = nullptr; + } else { + this->Fetch(); + } + return *this; } - BatchSet GetBatchSet() { - auto begin_iter = BatchIterator( - new SparseBatchIteratorImpl, - SparsePage>(external_prefetcher_.get())); - return BatchSet(begin_iter); - } - MetaInfo info; + void Reset() override { + if (proxy_) { + TryLockGuard guard{single_threaded_}; + iter_.Reset(); + } + SparsePageSourceImpl::Reset(); - private: - std::unique_ptr> external_prefetcher_; - CacheInfo cache_info_; + TryLockGuard guard{single_threaded_}; + base_row_id_ = 0; + } }; -class CSCPageSource { +// A mixin for advancing the iterator. 
+template +class PageSourceIncMixIn : public SparsePageSourceImpl { + protected: + std::shared_ptr source_; + public: - CSCPageSource(DMatrix* src, const std::string& cache_info, - const size_t page_size = DMatrix::kPageSize) { - std::string page_type = ".col.page"; - cache_info_ = ParseCacheInfo(cache_info, page_type); - for (auto file : cache_info_.name_shards) { - CheckCacheFileExists(file); - } - { - SparsePageWriter writer(cache_info_.name_shards, - cache_info_.format_shards, 6); - std::shared_ptr page; - writer.Alloc(&page); - page->Clear(); + using SparsePageSourceImpl::SparsePageSourceImpl; + PageSourceIncMixIn& operator++() final { + TryLockGuard guard{this->single_threaded_}; + ++(*source_); - size_t bytes_write = 0; - double tstart = dmlc::GetTime(); - for (auto& batch : src->GetBatches()) { - page->PushCSC(batch.GetTranspose(src->Info().num_col_)); + ++this->count_; + this->at_end_ = source_->AtEnd(); - if (page->MemCostBytes() >= page_size) { - bytes_write += page->MemCostBytes(); - writer.PushWrite(std::move(page)); - writer.Alloc(&page); - page->Clear(); - double tdiff = dmlc::GetTime() - tstart; - LOG(INFO) << "Writing to " << cache_info << " in " - << ((bytes_write >> 20UL) / tdiff) << " MB/s, " - << (bytes_write >> 20UL) << " written"; - } + if (this->at_end_) { + this->cache_info_->Commit(); + if (this->n_batches_ != 0) { + CHECK_EQ(this->count_, this->n_batches_); } - if (page->data.Size() != 0) { - writer.PushWrite(std::move(page)); - } - LOG(INFO) << "CSCPageSource: Finished writing to " - << cache_info_.name_info; + CHECK_GE(this->count_, 1); + } else { + this->Fetch(); } - external_prefetcher_.reset( - new ExternalMemoryPrefetcher(cache_info_)); + CHECK_EQ(source_->Iter(), this->count_); + return *this; } - - ~CSCPageSource() { - external_prefetcher_.reset(); - for (auto file : cache_info_.name_shards) { - TryDeleteCacheFile(file); - } - } - - BatchSet GetBatchSet() { - auto begin_iter = BatchIterator( - new SparseBatchIteratorImpl, 
CSCPage>( - external_prefetcher_.get())); - return BatchSet(begin_iter); - } - - private: - std::unique_ptr> external_prefetcher_; - CacheInfo cache_info_; }; -class SortedCSCPageSource { +class CSCPageSource : public PageSourceIncMixIn { + protected: + void Fetch() final { + if (!this->ReadCache()) { + auto const &csr = source_->Page(); + this->page_.reset(new CSCPage{}); + // we might be able to optimize this by merging transpose and pushcsc + this->page_->PushCSC(csr->GetTranspose(n_features_)); + page_->SetBaseRowId(csr->base_rowid); + this->WriteCache(); + } + } + public: - SortedCSCPageSource(DMatrix* src, const std::string& cache_info, - const size_t page_size = DMatrix::kPageSize) { - std::string page_type = ".sorted.col.page"; - cache_info_ = ParseCacheInfo(cache_info, page_type); - for (auto file : cache_info_.name_shards) { - CheckCacheFileExists(file); - } - { - SparsePageWriter writer(cache_info_.name_shards, - cache_info_.format_shards, 6); - std::shared_ptr page; - writer.Alloc(&page); - page->Clear(); - - size_t bytes_write = 0; - double tstart = dmlc::GetTime(); - for (auto& batch : src->GetBatches()) { - SparsePage tmp = batch.GetTranspose(src->Info().num_col_); - page->PushCSC(tmp); - page->SortRows(); - - if (page->MemCostBytes() >= page_size) { - bytes_write += page->MemCostBytes(); - writer.PushWrite(std::move(page)); - writer.Alloc(&page); - page->Clear(); - double tdiff = dmlc::GetTime() - tstart; - LOG(INFO) << "Writing to " << cache_info << " in " - << ((bytes_write >> 20UL) / tdiff) << " MB/s, " - << (bytes_write >> 20UL) << " written"; - } - } - if (page->data.Size() != 0) { - writer.PushWrite(std::move(page)); - } - LOG(INFO) << "SortedCSCPageSource: Finished writing to " - << cache_info_.name_info; - } - external_prefetcher_.reset( - new ExternalMemoryPrefetcher(cache_info_)); + CSCPageSource( + float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches, + std::shared_ptr cache, + std::shared_ptr source) + : 
PageSourceIncMixIn(missing, nthreads, n_features, + n_batches, cache) { + this->source_ = source; + this->Fetch(); } - ~SortedCSCPageSource() { - external_prefetcher_.reset(); - for (auto file : cache_info_.name_shards) { - TryDeleteCacheFile(file); - } - } - - BatchSet GetBatchSet() { - auto begin_iter = BatchIterator( - new SparseBatchIteratorImpl, - SortedCSCPage>(external_prefetcher_.get())); - return BatchSet(begin_iter); - } - - private: - std::unique_ptr> external_prefetcher_; - CacheInfo cache_info_; }; +class SortedCSCPageSource : public PageSourceIncMixIn { + protected: + void Fetch() final { + if (!this->ReadCache()) { + auto const &csr = this->source_->Page(); + this->page_.reset(new SortedCSCPage{}); + // we might be able to optimize this by merging transpose and pushcsc + this->page_->PushCSC(csr->GetTranspose(n_features_)); + CHECK_EQ(this->page_->Size(), n_features_); + CHECK_EQ(this->page_->data.Size(), csr->data.Size()); + this->page_->SortRows(); + page_->SetBaseRowId(csr->base_rowid); + this->WriteCache(); + } + } + + public: + SortedCSCPageSource(float missing, int nthreads, bst_feature_t n_features, + uint32_t n_batches, std::shared_ptr cache, + std::shared_ptr source) + : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache) { + this->source_ = source; + this->Fetch(); + } +}; } // namespace data } // namespace xgboost #endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ diff --git a/src/data/sparse_page_writer.h b/src/data/sparse_page_writer.h index 2b079fb6c..eafbaa652 100644 --- a/src/data/sparse_page_writer.h +++ b/src/data/sparse_page_writer.h @@ -63,103 +63,6 @@ inline SparsePageFormat* CreatePageFormat(const std::string& name) { return (e->body)(); } -#if DMLC_ENABLE_STD_THREAD -/*! - * \brief A threaded writer to write sparse batch page to sharded files. - * @tparam T Type of the page. - */ -template -class SparsePageWriter { - public: - /*! - * \brief constructor - * \param name_shards name of shard files. 
- * \param format_shards format of each shard. - * \param extra_buffer_capacity Extra buffer capacity before block. - */ - explicit SparsePageWriter(const std::vector& name_shards, - const std::vector& format_shards, - size_t extra_buffer_capacity) - : num_free_buffer_(extra_buffer_capacity + name_shards.size()), - clock_ptr_(0), - workers_(name_shards.size()), - qworkers_(name_shards.size()) { - CHECK_EQ(name_shards.size(), format_shards.size()); - // start writer threads - for (size_t i = 0; i < name_shards.size(); ++i) { - std::string name_shard = name_shards[i]; - std::string format_shard = format_shards[i]; - auto* wqueue = &qworkers_[i]; - workers_[i].reset(new std::thread( - [this, name_shard, format_shard, wqueue]() { - std::unique_ptr fo(dmlc::Stream::Create(name_shard.c_str(), "w")); - std::unique_ptr> fmt(CreatePageFormat(format_shard)); - fo->Write(format_shard); - std::shared_ptr page; - while (wqueue->Pop(&page)) { - if (page == nullptr) break; - fmt->Write(*page, fo.get()); - qrecycle_.Push(std::move(page)); - } - fo.reset(nullptr); - LOG(INFO) << "SparsePageWriter Finished writing to " << name_shard; - })); - } - } - - /*! \brief destructor, will close the files automatically */ - ~SparsePageWriter() { - for (auto& queue : qworkers_) { - // use nullptr to signal termination. - std::shared_ptr sig(nullptr); - queue.Push(std::move(sig)); - } - for (auto& thread : workers_) { - thread->join(); - } - } - - /*! - * \brief Push a write job to the writer. - * This function won't block, - * writing is done by another thread inside writer. - * \param page The page to be written - */ - void PushWrite(std::shared_ptr&& page) { - qworkers_[clock_ptr_].Push(std::move(page)); - clock_ptr_ = (clock_ptr_ + 1) % workers_.size(); - } - - /*! - * \brief Allocate a page to store results. - * This function can block when the writer is too slow and buffer pages - * have not yet been recycled. - * \param out_page Used to store the allocated pages. 
- */ - void Alloc(std::shared_ptr* out_page) { - CHECK(*out_page == nullptr); - if (num_free_buffer_ != 0) { - out_page->reset(new T()); - --num_free_buffer_; - } else { - CHECK(qrecycle_.Pop(out_page)); - } - } - - private: - /*! \brief number of allocated pages */ - size_t num_free_buffer_; - /*! \brief clock_pointer */ - size_t clock_ptr_; - /*! \brief writer threads */ - std::vector> workers_; - /*! \brief recycler queue */ - dmlc::ConcurrentBlockingQueue> qrecycle_; - /*! \brief worker threads */ - std::vector>> qworkers_; -}; -#endif // DMLC_ENABLE_STD_THREAD - /*! * \brief Registry entry for sparse page format. */ diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 3caa83f13..843341c36 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -131,7 +131,7 @@ class PoissonSampling : public thrust::binary_function gpair, DMatrix* dmat) { return {dmat->Info().num_row_, page_, gpair}; } -ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page, +ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param) : batch_param_(batch_param), @@ -171,7 +171,7 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span return {dmat->Info().num_row_, page_.get(), gpair}; } -UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample) +UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample) : page_(page), subsample_(subsample) {} GradientBasedSample UniformSampling::Sample(common::Span gpair, DMatrix* dmat) { @@ -183,7 +183,7 @@ GradientBasedSample UniformSampling::Sample(common::Span gpair, DM return {dmat->Info().num_row_, page_, gpair}; } -ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page, +ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page, size_t n_rows, const 
BatchParam& batch_param, float subsample) @@ -231,7 +231,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span gpa } ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling( - EllpackPageImpl* page, + EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param, float subsample) @@ -313,7 +313,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span gpair; }; @@ -31,17 +31,17 @@ class SamplingStrategy { /*! \brief No sampling in in-memory mode. */ class NoSampling : public SamplingStrategy { public: - explicit NoSampling(EllpackPageImpl* page); + explicit NoSampling(EllpackPageImpl const* page); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl* page_; + EllpackPageImpl const* page_; }; /*! \brief No sampling in external memory mode. */ class ExternalMemoryNoSampling : public SamplingStrategy { public: - ExternalMemoryNoSampling(EllpackPageImpl* page, + ExternalMemoryNoSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; @@ -55,25 +55,25 @@ class ExternalMemoryNoSampling : public SamplingStrategy { /*! \brief Uniform sampling in in-memory mode. */ class UniformSampling : public SamplingStrategy { public: - UniformSampling(EllpackPageImpl* page, float subsample); + UniformSampling(EllpackPageImpl const* page, float subsample); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl* page_; + EllpackPageImpl const* page_; float subsample_; }; /*! \brief No sampling in external memory mode. 
*/ class ExternalMemoryUniformSampling : public SamplingStrategy { public: - ExternalMemoryUniformSampling(EllpackPageImpl* page, + ExternalMemoryUniformSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param, float subsample); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl* original_page_; + EllpackPageImpl const* original_page_; BatchParam batch_param_; float subsample_; std::unique_ptr page_; @@ -84,14 +84,14 @@ class ExternalMemoryUniformSampling : public SamplingStrategy { /*! \brief Gradient-based sampling in in-memory mode.. */ class GradientBasedSampling : public SamplingStrategy { public: - GradientBasedSampling(EllpackPageImpl* page, + GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param, float subsample); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl* page_; + EllpackPageImpl const* page_; float subsample_; dh::caching_device_vector threshold_; dh::caching_device_vector grad_sum_; @@ -100,14 +100,14 @@ class GradientBasedSampling : public SamplingStrategy { /*! \brief Gradient-based sampling in external memory mode.. 
*/ class ExternalMemoryGradientBasedSampling : public SamplingStrategy { public: - ExternalMemoryGradientBasedSampling(EllpackPageImpl* page, + ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param, float subsample); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl* original_page_; + EllpackPageImpl const* original_page_; BatchParam batch_param_; float subsample_; dh::caching_device_vector threshold_; @@ -128,7 +128,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy { */ class GradientBasedSampler { public: - GradientBasedSampler(EllpackPageImpl* page, + GradientBasedSampler(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param, float subsample, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index b1bf0af56..9ba233334 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -163,7 +163,7 @@ class DeviceHistogram { template struct GPUHistMakerDevice { int device_id; - EllpackPageImpl* page; + EllpackPageImpl const* page; common::Span feature_types; BatchParam batch_param; @@ -199,7 +199,7 @@ struct GPUHistMakerDevice { dh::caching_device_vector node_categories; GPUHistMakerDevice(int _device_id, - EllpackPageImpl* _page, + EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, TrainParam _param, @@ -488,7 +488,7 @@ struct GPUHistMakerDevice { } } - void FinalisePositionInPage(EllpackPageImpl *page, + void FinalisePositionInPage(EllpackPageImpl const *page, const common::Span d_nodes, common::Span d_feature_types, common::Span categories, @@ -812,7 +812,6 @@ class GPUHistMakerSpecialised { BatchParam batch_param{ device_, param_.max_bin, - generic_param_->gpu_page_size }; auto page = (*dmat->GetBatches(batch_param).begin()).Impl(); dh::safe_cuda(cudaSetDevice(device_)); diff --git a/tests/cpp/common/test_column_matrix.cc 
b/tests/cpp/common/test_column_matrix.cc index c3eaf1008..5d5f03afd 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -125,12 +125,10 @@ TEST(DenseColumnWithMissing, Test) { } void TestGHistIndexMatrixCreation(size_t nthreads) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; size_t constexpr kPageSize = 1024, kEntriesPerCol = 3; size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; /* This should create multiple sparse pages */ - std::unique_ptr dmat{ CreateSparsePageDMatrix(kEntries, kPageSize, filename) }; + std::unique_ptr dmat{ CreateSparsePageDMatrix(kEntries) }; omp_set_num_threads(nthreads); GHistIndexMatrix gmat(dmat.get(), 256); } diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index a8636c854..aa91f9c29 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -83,7 +83,7 @@ inline std::shared_ptr GetExternalMemoryDMatrixFromData( } fo.close(); return std::shared_ptr(DMatrix::Load( - tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)); + tmp_file + "#" + tmp_file + ".cache", true, false, "auto")); } // Test that elements are approximately equally distributed among bins diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index 195dd6965..6c1b42571 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -59,12 +59,9 @@ TEST(SparsePage, PushCSC) { } TEST(SparsePage, PushCSCAfterTranspose) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; size_t constexpr kPageSize = 1024, kEntriesPerCol = 3; size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = - CreateSparsePageDMatrix(kEntries, 64UL, filename); + std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); const int ncols = dmat->Info().num_col_; SparsePage page; // Consolidated sparse page for (const auto &batch : 
dmat->GetBatches()) { @@ -76,12 +73,12 @@ TEST(SparsePage, PushCSCAfterTranspose) { // Make sure that the final sparse page has the right number of entries ASSERT_EQ(kEntries, page.data.Size()); - // The feature value for a feature in each row should be identical, as that is - // how the dmatrix has been created - for (size_t i = 0; i < page.Size(); ++i) { - auto inst = page.GetView()[i]; - for (size_t j = 1; j < inst.size(); ++j) { - ASSERT_EQ(inst[0].fvalue, inst[j].fvalue); + page.SortRows(); + auto v = page.GetView(); + for (size_t i = 0; i < v.Size(); ++i) { + auto column = v[i]; + for (size_t j = 1; j < column.size(); ++j) { + ASSERT_GE(column[j].fvalue, column[j-1].fvalue); } } } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 6b5b70814..c73dd9910 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -142,7 +142,7 @@ TEST(EllpackPage, Copy) { dmlc::TemporaryDirectory tmpdir; std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - BatchParam param{0, 256, kPageSize}; + BatchParam param{0, 256}; auto page = (*dmat->GetBatches(param).begin()).Impl(); // Create an empty result page. @@ -188,7 +188,7 @@ TEST(EllpackPage, Compact) { dmlc::TemporaryDirectory tmpdir; std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - BatchParam param{0, 256, kPageSize}; + BatchParam param{0, 256}; auto page = (*dmat->GetBatches(param).begin()).Impl(); // Create an empty result page. 
@@ -212,7 +212,7 @@ TEST(EllpackPage, Compact) { std::vector row_result(kCols); for (auto& page : dmat->GetBatches(param)) { auto impl = page.Impl(); - EXPECT_EQ(impl->base_rowid, current_row); + ASSERT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { size_t compacted_row = row_indexes_h[current_row]; diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc new file mode 100644 index 000000000..12ae9e726 --- /dev/null +++ b/tests/cpp/data/test_file_iterator.cc @@ -0,0 +1,46 @@ +/*! + * Copyright 2021 XGBoost contributors + */ +#include +#include + +#include + +#include "../../../src/data/file_iterator.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../../../src/data/adapter.h" +#include "../helpers.h" + +namespace xgboost { +namespace data { +TEST(FileIterator, Basic) { + auto check_n_features = [](FileIterator *iter) { + size_t n_features = 0; + iter->Reset(); + while (iter->Next()) { + auto proxy = MakeProxy(iter->Proxy()); + auto csr = dmlc::get>(proxy->Adapter()); + n_features = std::max(n_features, csr->NumColumns()); + } + ASSERT_EQ(n_features, 5); + }; + + dmlc::TemporaryDirectory tmpdir; + { + auto zpath = tmpdir.path + "/0-based.svm"; + CreateBigTestData(zpath, 3 * 64, true); + zpath += "?indexing_mode=0"; + FileIterator iter{zpath, 0, 1, "libsvm"}; + check_n_features(&iter); + } + + { + auto opath = tmpdir.path + "/1-based.svm"; + CreateBigTestData(opath, 3 * 64, false); + opath += "?indexing_mode=1"; + FileIterator iter{opath, 0, 1, "libsvm"}; + check_n_features(&iter); + } +} +} // namespace data +} // namespace xgboost diff --git a/tests/cpp/data/test_iterative_device_dmatrix.cu b/tests/cpp/data/test_iterative_device_dmatrix.cu index 3e318221e..cb64a3b5c 100644 --- a/tests/cpp/data/test_iterative_device_dmatrix.cu +++ b/tests/cpp/data/test_iterative_device_dmatrix.cu @@ -142,7 +142,7 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { IterativeDeviceDMatrix m( &iter, iter.Proxy(), 
Reset, Next, std::numeric_limits::quiet_NaN(), 0, 256); - auto &ellpack = *m.GetBatches({0, 256, 0}).begin(); + auto &ellpack = *m.GetBatches({0, 256}).begin(); auto impl = ellpack.Impl(); common::CompressedIterator iterator( impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 355ea8a70..ad59ba4f5 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -260,7 +260,7 @@ TEST(MetaInfo, HostExtend) { lhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size()); rhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size()); - lhs.Extend(rhs, true); + lhs.Extend(rhs, true, true); ASSERT_EQ(lhs.num_row_, kRows * 2); ASSERT_TRUE(lhs.labels_.HostCanRead()); ASSERT_TRUE(rhs.labels_.HostCanRead()); diff --git a/tests/cpp/data/test_metainfo.cu b/tests/cpp/data/test_metainfo.cu index 85860096e..090374b91 100644 --- a/tests/cpp/data/test_metainfo.cu +++ b/tests/cpp/data/test_metainfo.cu @@ -141,7 +141,7 @@ TEST(MetaInfo, DeviceExtend) { lhs.num_row_ = kRows; rhs.num_row_ = kRows; - lhs.Extend(rhs, true); + lhs.Extend(rhs, true, true); ASSERT_EQ(lhs.num_row_, kRows * 2); ASSERT_FALSE(lhs.labels_.HostCanRead()); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index f20e259fd..0ae69a67f 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -6,11 +6,100 @@ #include #include "../../../src/common/io.h" #include "../../../src/data/adapter.h" +#include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/sparse_page_dmatrix.h" +#include "../../../src/data/file_iterator.h" #include "../helpers.h" using namespace xgboost; // NOLINT +template +void TestSparseDMatrixLoadFile() { + dmlc::TemporaryDirectory tmpdir; + auto opath = tmpdir.path + "/1-based.svm"; + CreateBigTestData(opath, 3 * 64, false); + opath += 
"?indexing_mode=1"; + data::FileIterator iter{opath, 0, 1, "libsvm"}; + data::SparsePageDMatrix m{&iter, + iter.Proxy(), + data::fileiter::Reset, + data::fileiter::Next, + std::numeric_limits::quiet_NaN(), + 1, + "cache"}; + ASSERT_EQ(m.Info().num_col_, 5); + ASSERT_EQ(m.Info().num_row_, 64); + + std::unique_ptr> parser( + dmlc::Parser::Create(opath.c_str(), 0, 1, "auto")); + auto adapter = data::FileAdapter{parser.get()}; + + data::SimpleDMatrix simple{&adapter, std::numeric_limits::quiet_NaN(), + 1}; + Page out; + for (auto const& page : m.GetBatches()) { + if (std::is_same::value) { + out.Push(page); + } else { + out.PushCSC(page); + } + } + ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_); + ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_); + + for (auto const& page : simple.GetBatches()) { + ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector()); + for (size_t i = 0; i < page.data.Size(); ++i) { + ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue); + } + } +} + +TEST(SparsePageDMatrix, LoadFile) { + TestSparseDMatrixLoadFile(); + TestSparseDMatrixLoadFile(); + TestSparseDMatrixLoadFile(); +} + +// allow caller to retain pages so they can process multiple pages at the same time. 
+template +void TestRetainPage() { + auto m = CreateSparsePageDMatrix(10000); + auto batches = m->GetBatches(); + auto begin = batches.begin(); + auto end = batches.end(); + + std::vector pages; + std::vector> iterators; + for (auto it = begin; it != end; ++it) { + iterators.push_back(it.Page()); + pages.emplace_back(Page{}); + if (std::is_same::value) { + pages.back().Push(*it); + } else { + pages.back().PushCSC(*it); + } + ASSERT_EQ(pages.back().Size(), (*it).Size()); + } + ASSERT_GE(iterators.size(), 2); + + for (size_t i = 0; i < iterators.size(); ++i) { + ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size()); + ASSERT_EQ((*iterators[i]).data.HostVector(), pages.at(i).data.HostVector()); + } + + // make sure it's const and the caller can not modify the content of page. + for (auto& page : m->GetBatches()) { + static_assert(std::is_const>::value, ""); + } +} + +TEST(SparsePageDMatrix, RetainSparsePage) { + TestRetainPage(); + TestRetainPage(); + TestRetainPage(); +} + TEST(SparsePageDMatrix, MetaInfo) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; @@ -19,8 +108,6 @@ TEST(SparsePageDMatrix, MetaInfo) { xgboost::DMatrix *dmat = xgboost::DMatrix::Load( tmp_file + "#" + tmp_file + ".cache", false, false); - std::cout << tmp_file << std::endl; - EXPECT_TRUE(FileExists(tmp_file + ".cache")); // Test the metadata that was parsed EXPECT_EQ(dmat->Info().num_row_, 8ul); @@ -32,10 +119,7 @@ TEST(SparsePageDMatrix, MetaInfo) { } TEST(SparsePageDMatrix, RowAccess) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; - std::unique_ptr dmat = - xgboost::CreateSparsePageDMatrix(24, 4, filename); + std::unique_ptr dmat = xgboost::CreateSparsePageDMatrix(24); // Test the data read into the first row auto &batch = *dmat->GetBatches().begin(); @@ -43,7 +127,7 @@ TEST(SparsePageDMatrix, RowAccess) { auto first_row = page[0]; ASSERT_EQ(first_row.size(), 3ul); EXPECT_EQ(first_row[2].index, 2u); - 
EXPECT_EQ(first_row[2].fvalue, 20); + EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4); } TEST(SparsePageDMatrix, ColAccess) { @@ -54,55 +138,46 @@ TEST(SparsePageDMatrix, ColAccess) { xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false); // Loop over the batches and assert the data is as expected + size_t iter = 0; for (auto const &col_batch : dmat->GetBatches()) { auto col_page = col_batch.GetView(); - EXPECT_EQ(col_page.Size(), dmat->Info().num_col_); - EXPECT_EQ(col_page[1][0].fvalue, 10.0f); - EXPECT_EQ(col_page[1].size(), 1); + ASSERT_EQ(col_page.Size(), dmat->Info().num_col_); + if (iter == 1) { + ASSERT_EQ(col_page[0][0].fvalue, 0.f); + ASSERT_EQ(col_page[3][0].fvalue, 30.f); + ASSERT_EQ(col_page[3][0].index, 1); + ASSERT_EQ(col_page[3].size(), 1); + } else { + ASSERT_EQ(col_page[1][0].fvalue, 10.0f); + ASSERT_EQ(col_page[1].size(), 1); + } + CHECK_LE(col_batch.base_rowid, dmat->Info().num_row_); + ++iter; } // Loop over the batches and assert the data is as expected + iter = 0; for (auto const &col_batch : dmat->GetBatches()) { auto col_page = col_batch.GetView(); EXPECT_EQ(col_page.Size(), dmat->Info().num_col_); - EXPECT_EQ(col_page[1][0].fvalue, 10.0f); - EXPECT_EQ(col_page[1].size(), 1); + if (iter == 0) { + EXPECT_EQ(col_page[1][0].fvalue, 10.0f); + EXPECT_EQ(col_page[1].size(), 1); + } else { + EXPECT_EQ(col_page[3][0].fvalue, 30.f); + EXPECT_EQ(col_page[3].size(), 1); + } + iter++; } - - EXPECT_TRUE(FileExists(tmp_file + ".cache")); - EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); - EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page")); - EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page")); - delete dmat; - - EXPECT_FALSE(FileExists(tmp_file + ".cache")); - EXPECT_FALSE(FileExists(tmp_file + ".cache.row.page")); - EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page")); - EXPECT_FALSE(FileExists(tmp_file + ".cache.sorted.col.page")); -} - -TEST(SparsePageDMatrix, ExistingCacheFile) { - dmlc::TemporaryDirectory 
tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; - size_t constexpr kPageSize = 64, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = - xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename); - EXPECT_ANY_THROW({ - std::unique_ptr dmat2 = - xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename); - }); } TEST(SparsePageDMatrix, ThreadSafetyException) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/test"; - size_t constexpr kPageSize = 64, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; + size_t constexpr kEntriesPerCol = 3; + size_t constexpr kEntries = 64 * kEntriesPerCol * 2; std::unique_ptr dmat = - xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename); + xgboost::CreateSparsePageDMatrix(kEntries); int threads = 1000; @@ -134,13 +209,10 @@ TEST(SparsePageDMatrix, ThreadSafetyException) { // Multi-batches access TEST(SparsePageDMatrix, ColAccessBatches) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; size_t constexpr kPageSize = 1024, kEntriesPerCol = 3; size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; // Create multiple sparse pages - std::unique_ptr dmat{ - xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename)}; + std::unique_ptr dmat{xgboost::CreateSparsePageDMatrix(kEntries)}; auto n_threads = omp_get_max_threads(); omp_set_num_threads(16); for (auto const &page : dmat->GetBatches()) { @@ -149,234 +221,37 @@ TEST(SparsePageDMatrix, ColAccessBatches) { omp_set_num_threads(n_threads); } -TEST(SparsePageDMatrix, Empty) { - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - std::vector data{}; - std::vector feature_idx = {}; - std::vector row_ptr = {}; - - { - data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(), - data.data(), 0, 0, 0); - data::SparsePageDMatrix dmat( - 
&csr_adapter, std::numeric_limits::quiet_NaN(), 1, tmp_file); - EXPECT_EQ(dmat.Info().num_nonzero_, 0); - EXPECT_EQ(dmat.Info().num_row_, 0); - EXPECT_EQ(dmat.Info().num_col_, 0); - for (auto &batch : dmat.GetBatches()) { - EXPECT_EQ(batch.Size(), 0); - } - } - - { - data::DenseAdapter dense_adapter(nullptr, 0, 0); - data::SparsePageDMatrix dmat2( - &dense_adapter, std::numeric_limits::quiet_NaN(), 1, tmp_file); - EXPECT_EQ(dmat2.Info().num_nonzero_, 0); - EXPECT_EQ(dmat2.Info().num_row_, 0); - EXPECT_EQ(dmat2.Info().num_col_, 0); - for (auto &batch : dmat2.GetBatches()) { - EXPECT_EQ(batch.Size(), 0); - } - } - { - data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0); - data::SparsePageDMatrix dmat3( - &csc_adapter, std::numeric_limits::quiet_NaN(), 1, tmp_file); - EXPECT_EQ(dmat3.Info().num_nonzero_, 0); - EXPECT_EQ(dmat3.Info().num_row_, 0); - EXPECT_EQ(dmat3.Info().num_col_, 0); - for (auto &batch : dmat3.GetBatches()) { - EXPECT_EQ(batch.Size(), 0); - } - } -} - -TEST(SparsePageDMatrix, MissingData) { - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - std::vector data{0.0, std::nanf(""), 1.0}; - std::vector feature_idx = {0, 1, 0}; - std::vector row_ptr = {0, 2, 3}; - - data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, - 3, 2); - data::SparsePageDMatrix dmat( - &adapter, std::numeric_limits::quiet_NaN(), 1, tmp_file); - EXPECT_EQ(dmat.Info().num_nonzero_, 2); - - const std::string tmp_file2 = tempdir.path + "/simple2.libsvm"; - data::SparsePageDMatrix dmat2(&adapter, 1.0, 1, tmp_file2); - EXPECT_EQ(dmat2.Info().num_nonzero_, 1); -} - -TEST(SparsePageDMatrix, EmptyRow) { - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - std::vector data{0.0, 1.0}; - std::vector feature_idx = {0, 1}; - std::vector row_ptr = {0, 2, 2}; - - data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, - 2, 2); - 
data::SparsePageDMatrix dmat( - &adapter, std::numeric_limits::quiet_NaN(), 1, tmp_file); - EXPECT_EQ(dmat.Info().num_nonzero_, 2); - EXPECT_EQ(dmat.Info().num_row_, 2); - EXPECT_EQ(dmat.Info().num_col_, 2); -} - -TEST(SparsePageDMatrix, FromDense) { - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - int m = 3; - int n = 2; - std::vector data = {1, 2, 3, 4, 5, 6}; - data::DenseAdapter adapter(data.data(), m, n); - data::SparsePageDMatrix dmat( - &adapter, std::numeric_limits::quiet_NaN(), 1, tmp_file); - EXPECT_EQ(dmat.Info().num_col_, 2); - EXPECT_EQ(dmat.Info().num_row_, 3); - EXPECT_EQ(dmat.Info().num_nonzero_, 6); - - for (auto &batch : dmat.GetBatches()) { - auto page = batch.GetView(); - for (auto i = 0ull; i < batch.Size(); i++) { - auto inst = page[i]; - for (auto j = 0ull; j < inst.size(); j++) { - EXPECT_EQ(inst[j].fvalue, data[i * n + j]); - EXPECT_EQ(inst[j].index, j); - } - } - } -} - -TEST(SparsePageDMatrix, FromCSC) { - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - std::vector data = {1, 3, 2, 4, 5}; - std::vector row_idx = {0, 1, 0, 1, 2}; - std::vector col_ptr = {0, 2, 5}; - data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3); - data::SparsePageDMatrix dmat( - &adapter, std::numeric_limits::quiet_NaN(), -1, tmp_file); - EXPECT_EQ(dmat.Info().num_col_, 2); - EXPECT_EQ(dmat.Info().num_row_, 3); - EXPECT_EQ(dmat.Info().num_nonzero_, 5); - - auto &batch = *dmat.GetBatches().begin(); - auto page = batch.GetView(); - auto inst = page[0]; - EXPECT_EQ(inst[0].fvalue, 1); - EXPECT_EQ(inst[0].index, 0); - EXPECT_EQ(inst[1].fvalue, 2); - EXPECT_EQ(inst[1].index, 1); - - inst = page[1]; - EXPECT_EQ(inst[0].fvalue, 3); - EXPECT_EQ(inst[0].index, 0); - EXPECT_EQ(inst[1].fvalue, 4); - EXPECT_EQ(inst[1].index, 1); - - inst = page[2]; - EXPECT_EQ(inst[0].fvalue, 5); - EXPECT_EQ(inst[0].index, 1); -} - -TEST(SparsePageDMatrix, FromFile) { 
- std::string filename = "test.libsvm"; - CreateBigTestData(filename, 20); - std::unique_ptr> parser( - dmlc::Parser::Create(filename.c_str(), 0, 1, "auto")); - data::FileAdapter adapter(parser.get()); - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - - data::SparsePageDMatrix dmat( - &adapter, std::numeric_limits::quiet_NaN(), -1, tmp_file, 1); - ASSERT_EQ(dmat.Info().num_col_, 5); - - for (auto &batch : dmat.GetBatches()) { - std::vector expected_offset(batch.Size() + 1); - auto page = batch.GetView(); - int n = -3; - std::generate(expected_offset.begin(), expected_offset.end(), - [&n] { return n += 3; }); - EXPECT_EQ(batch.offset.HostVector(), expected_offset); - - if (batch.base_rowid % 2 == 0) { - EXPECT_EQ(page[0][0].index, 0); - EXPECT_EQ(page[0][1].index, 1); - EXPECT_EQ(page[0][2].index, 2); - } else { - EXPECT_EQ(page[0][0].index, 0); - EXPECT_EQ(page[0][1].index, 3); - EXPECT_EQ(page[0][2].index, 4); - } - } -} - -TEST(SparsePageDMatrix, Large) { - std::string filename = "test.libsvm"; - CreateBigTestData(filename, 1 << 16); - std::unique_ptr> parser( - dmlc::Parser::Create(filename.c_str(), 0, 1, "auto")); - data::FileAdapter adapter(parser.get()); - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - - std::unique_ptr sparse{new data::SparsePageDMatrix( - &adapter, std::numeric_limits::quiet_NaN(), -1, tmp_file, 16)}; - std::unique_ptr simple{DMatrix::Load(filename, true, true)}; - - std::vector sparse_data; - std::vector sparse_rptr; - std::vector sparse_cids; - DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids); - - std::vector simple_data; - std::vector simple_rptr; - std::vector simple_cids; - DMatrixToCSR(simple.get(), &simple_data, &simple_rptr, &simple_cids); - - ASSERT_EQ(sparse_rptr.size(), sparse->Info().num_row_ + 1); - ASSERT_EQ(sparse_rptr.size(), simple->Info().num_row_ + 1); - - ASSERT_EQ(sparse_data.size(), 
simple_data.size()); - ASSERT_EQ(sparse_data, simple_data); - ASSERT_EQ(sparse_rptr.size(), simple_rptr.size()); - ASSERT_EQ(sparse_rptr, simple_rptr); - ASSERT_EQ(sparse_cids, simple_cids); -} - -auto TestSparsePageDMatrixDeterminism(int32_t threads, std::string const& filename) { +auto TestSparsePageDMatrixDeterminism(int32_t threads) { omp_set_num_threads(threads); std::vector sparse_data; std::vector sparse_rptr; std::vector sparse_cids; - - std::unique_ptr> parser( - dmlc::Parser::Create(filename.c_str(), 0, 1, "auto")); - data::FileAdapter adapter(parser.get()); dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/simple.libsvm"; - std::unique_ptr sparse{new data::SparsePageDMatrix( - &adapter, std::numeric_limits::quiet_NaN(), -1, tmp_file, 1 << 8)}; + std::string filename = tempdir.path + "/simple.libsvm"; + CreateBigTestData(filename, 1 << 16); + + data::FileIterator iter(filename, 0, 1, "auto"); + std::unique_ptr sparse{new data::SparsePageDMatrix{ + &iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next, + std::numeric_limits::quiet_NaN(), 1, filename}}; DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids); - std::string cache_name = tmp_file + ".row.page"; + auto cache_name = + data::MakeId(filename, + dynamic_cast(sparse.get())) + + ".row.page"; std::string cache = common::LoadSequentialFile(cache_name); return cache; } TEST(SparsePageDMatrix, Determinism) { - std::string filename = "test.libsvm"; - CreateBigTestData(filename, 1 << 16); +#if defined(_MSC_VER) + return; +#endif // defined(_MSC_VER) std::vector caches; for (size_t i = 1; i < 18; i += 2) { - caches.emplace_back(TestSparsePageDMatrixDeterminism(i, filename)); + caches.emplace_back(TestSparsePageDMatrixDeterminism(i)); } for (size_t i = 1; i < caches.size(); ++i) { diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 26058a836..d598b420e 100644 --- 
a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -4,6 +4,7 @@ #include "../helpers.h" #include "../../../src/common/compressed_iterator.h" #include "../../../src/data/ellpack_page.cuh" +#include "../../../src/data/sparse_page_dmatrix.h" namespace xgboost { @@ -14,13 +15,22 @@ TEST(SparsePageDMatrix, EllpackPage) { DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false); // Loop over the batches and assert the data is as expected - for (const auto& batch : dmat->GetBatches({0, 256, 64})) { - EXPECT_EQ(batch.Size(), dmat->Info().num_row_); + size_t n = 0; + for (const auto& batch : dmat->GetBatches({0, 256})) { + n += batch.Size(); } + EXPECT_EQ(n, dmat->Info().num_row_); - EXPECT_TRUE(FileExists(tmp_file + ".cache")); - EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); - EXPECT_TRUE(FileExists(tmp_file + ".cache.ellpack.page")); + auto path = + data::MakeId(tmp_file + ".cache", + dynamic_cast(dmat)) + + ".row.page"; + EXPECT_TRUE(FileExists(path)); + path = + data::MakeId(tmp_file + ".cache", + dynamic_cast(dmat)) + + ".ellpack.page"; + EXPECT_TRUE(FileExists(path)); delete dmat; } @@ -30,12 +40,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) { std::string filename = tmpdir.path + "/big.libsvm"; size_t constexpr kPageSize = 64, kEntriesPerCol = 3; size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename); + std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries, filename); // Loop over the batches and count the records int64_t batch_count = 0; int64_t row_count = 0; - for (const auto& batch : dmat->GetBatches({0, 256, 7UL})) { + for (const auto& batch : dmat->GetBatches({0, 256})) { EXPECT_LT(batch.Size(), dmat->Info().num_row_); batch_count++; row_count += batch.Size(); @@ -43,7 +53,36 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) { EXPECT_GE(batch_count, 2); EXPECT_EQ(row_count, 
dmat->Info().num_row_); - EXPECT_TRUE(FileExists(filename + ".cache.ellpack.page")); + auto path = + data::MakeId(filename, + dynamic_cast(dmat.get())) + + ".ellpack.page"; +} + +TEST(SparsePageDMatrix, RetainEllpackPage) { + auto m = CreateSparsePageDMatrix(10000); + auto batches = m->GetBatches({0, 32}); + auto begin = batches.begin(); + auto end = batches.end(); + + std::vector> gidx_buffers; + std::vector> iterators; + for (auto it = begin; it != end; ++it) { + iterators.push_back(it.Page()); + gidx_buffers.emplace_back(HostDeviceVector{}); + gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size()); + gidx_buffers.back().Copy((*it).Impl()->gidx_buffer); + } + ASSERT_GE(iterators.size(), 2); + + for (size_t i = 0; i < iterators.size(); ++i) { + ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector()); + } + + // make sure it's const and the caller can not modify the content of page. + for (auto& page : m->GetBatches({0, 32})) { + static_assert(std::is_const>::value, ""); + } } TEST(SparsePageDMatrix, EllpackPageContent) { @@ -59,7 +98,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) { std::unique_ptr dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - BatchParam param{0, 2, 0}; + BatchParam param{0, 2}; auto impl = (*dmat->GetBatches(param).begin()).Impl(); EXPECT_EQ(impl->base_rowid, 0); EXPECT_EQ(impl->n_rows, kRows); @@ -67,7 +106,17 @@ TEST(SparsePageDMatrix, EllpackPageContent) { EXPECT_EQ(impl->row_stride, 2); EXPECT_EQ(impl->Cuts().TotalBins(), 4); - auto impl_ext = (*dmat_ext->GetBatches(param).begin()).Impl(); + std::unique_ptr impl_ext; + size_t offset = 0; + for (auto& batch : dmat_ext->GetBatches(param)) { + if (!impl_ext) { + impl_ext.reset(new EllpackPageImpl( + batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(), + batch.Impl()->is_dense, batch.Impl()->row_stride, kRows)); + } + auto n_elems = impl_ext->Copy(0, batch.Impl(), offset); + offset += n_elems; + } 
EXPECT_EQ(impl_ext->base_rowid, 0); EXPECT_EQ(impl_ext->n_rows, kRows); EXPECT_FALSE(impl_ext->is_dense); @@ -109,7 +158,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) { std::unique_ptr dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - BatchParam param{0, kMaxBins, kPageSize}; + BatchParam param{0, kMaxBins}; auto impl = (*dmat->GetBatches(param).begin()).Impl(); EXPECT_EQ(impl->base_rowid, 0); EXPECT_EQ(impl->n_rows, kRows); @@ -150,7 +199,7 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) { std::unique_ptr dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - BatchParam param{0, kMaxBins, kPageSize}; + BatchParam param{0, kMaxBins}; size_t current_row = 0; for (auto& page : dmat_ext->GetBatches(param)) { diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 749dc4b66..7627dbf6e 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -155,7 +155,8 @@ TEST(GBTree, ChoosePredictor) { ASSERT_TRUE(data.HostCanWrite()); // pull data into device. 
- data = HostDeviceVector(data.HostVector(), 0); + data.HostVector(); + data.SetDevice(0); data.DeviceSpan(); ASSERT_FALSE(data.HostCanWrite()); diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 83a250e10..5f791eeef 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -18,6 +18,7 @@ #include "xgboost/c_api.h" #include "../../src/data/adapter.h" #include "../../src/data/simple_dmatrix.h" +#include "../../src/data/sparse_page_dmatrix.h" #include "../../src/gbm/gbtree_model.h" #include "xgboost/predictor.h" @@ -45,12 +46,25 @@ void CreateSimpleTestData(const std::string& filename) { CreateBigTestData(filename, 6); } -void CreateBigTestData(const std::string& filename, size_t n_entries) { +void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based) { std::ofstream fo(filename.c_str()); const size_t entries_per_row = 3; + std::string odd_row; + if (zero_based) { + odd_row = " 0:0 3:30 4:40\n"; + } else { + odd_row = " 1:0 4:30 5:40\n"; + } + std::string even_row; + if (zero_based) { + even_row = " 0:0 1:10 2:20\n"; + } else { + even_row = " 1:0 2:10 3:20\n"; + } + size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row; for (size_t i = 0; i < n_rows; ++i) { - const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n"; + auto row = i % 2 == 0 ? 
even_row : odd_row; fo << i << row; } } @@ -348,13 +362,20 @@ GetDMatrixFromData(const std::vector &x, int num_rows, int num_columns){ &adapter, std::numeric_limits::quiet_NaN(), 1)); } -std::unique_ptr CreateSparsePageDMatrix( - size_t n_entries, size_t page_size, std::string tmp_file) { - // Create sufficiently large data to make two row pages - CreateBigTestData(tmp_file, n_entries); - std::unique_ptr dmat { DMatrix::Load( - tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)}; - EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); +std::unique_ptr CreateSparsePageDMatrix(size_t n_entries, + std::string prefix) { + size_t n_columns = 3; + size_t n_rows = n_entries / n_columns; + ArrayIterForTest iter(0, n_rows, n_columns, 2); + + std::unique_ptr dmat{DMatrix::Create( + static_cast(&iter), iter.Proxy(), Reset, Next, + std::numeric_limits::quiet_NaN(), 1, prefix)}; + auto row_page_path = + data::MakeId(prefix, + dynamic_cast(dmat.get())) + + ".row.page"; + EXPECT_TRUE(FileExists(row_page_path)) << row_page_path; // Loop over the batches and count the records int64_t batch_count = 0; @@ -368,7 +389,6 @@ std::unique_ptr CreateSparsePageDMatrix( return dmat; } - std::unique_ptr CreateSparsePageDMatrixWithRC( size_t n_rows, size_t n_cols, size_t page_size, bool deterministic, const dmlc::TemporaryDirectory& tempdir) { @@ -432,7 +452,7 @@ std::unique_ptr CreateSparsePageDMatrixWithRC( uri += "#" + tmp_file + ".cache"; } std::unique_ptr dmat( - DMatrix::Load(uri, true, false, "auto", page_size)); + DMatrix::Load(uri, true, false, "auto")); return dmat; } @@ -481,6 +501,28 @@ std::unique_ptr CreateTrainedGBM( return gbm; } +ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, + size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} { + XGProxyDMatrixCreate(&proxy_); + rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity}); + std::tie(batches_, interface_) = + rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); 
+} + +ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); } + +int ArrayIterForTest::Next() { + if (iter_ == n_batches_) { + return 0; + } + XGProxyDMatrixSetDataDense(proxy_, batches_[iter_].c_str()); + iter_++; + return 1; +} + +size_t constexpr ArrayIterForTest::kRows; +size_t constexpr ArrayIterForTest::kCols; + void DMatrixToCSR(DMatrix *dmat, std::vector *p_data, std::vector *p_row_ptr, std::vector *p_cids) { diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 60501a67a..10c455270 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -8,16 +8,16 @@ namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) - : rows_{rows}, cols_{cols}, n_batches_{batches} { - XGProxyDMatrixCreate(&proxy_); - rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity}); + : ArrayIterForTest{sparsity, rows, cols, batches} { rng_->Device(0); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); } -CudaArrayIterForTest::~CudaArrayIterForTest() { XGDMatrixFree(proxy_); } +size_t constexpr CudaArrayIterForTest::kRows; +size_t constexpr CudaArrayIterForTest::kCols; +size_t constexpr CudaArrayIterForTest::kBatches; int CudaArrayIterForTest::Next() { if (iter_ == n_batches_) { @@ -28,8 +28,6 @@ int CudaArrayIterForTest::Next() { return 1; } -size_t constexpr CudaArrayIterForTest::kRows; -size_t constexpr CudaArrayIterForTest::kCols; std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_label, bool float_label, diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 7df5ddb2d..f5f88aff1 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -55,7 +55,9 @@ int64_t GetFileSize(const std::string& filename); void CreateSimpleTestData(const std::string& filename); -void CreateBigTestData(const std::string& filename, size_t n_entries); +// Create a libsvm format file with 3 entries per-row. 
`zero_based` specifies whether it's +// 0-based indexing. +void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true); void CheckObjFunction(std::unique_ptr const& obj, std::vector preds, @@ -300,8 +302,7 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) { std::shared_ptr GetDMatrixFromData(const std::vector &x, int num_rows, int num_columns); -std::unique_ptr CreateSparsePageDMatrix( - size_t n_entries, size_t page_size, std::string tmp_file); +std::unique_ptr CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache"); /** * \fn std::unique_ptr CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols, @@ -356,7 +357,8 @@ inline HostDeviceVector GenerateRandomGradients(const size_t n_row typedef void *DMatrixHandle; // NOLINT(*); -class CudaArrayIterForTest { +class ArrayIterForTest { + protected: HostDeviceVector data_; size_t iter_ {0}; DMatrixHandle proxy_; @@ -373,20 +375,32 @@ class CudaArrayIterForTest { size_t static constexpr kBatches { 100 }; size_t static constexpr kCols { 13 }; - explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows, - size_t cols = kCols, size_t batches = kBatches); - ~CudaArrayIterForTest(); - std::string AsArray() const { return interface_; } - int Next(); - void Reset() { + virtual int Next(); + virtual void Reset() { iter_ = 0; } size_t Iter() const { return iter_; } auto Proxy() -> decltype(proxy_) { return proxy_; } + + explicit ArrayIterForTest(float sparsity, size_t rows = kRows, + size_t cols = kCols, size_t batches = kBatches); + virtual ~ArrayIterForTest(); +}; + +class CudaArrayIterForTest : public ArrayIterForTest { + public: + size_t static constexpr kRows{1000}; + size_t static constexpr kBatches{100}; + size_t static constexpr kCols{13}; + + explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows, + size_t cols = kCols, size_t batches = kBatches); + int Next() override; + ~CudaArrayIterForTest() override = default; }; void 
DMatrixToCSR(DMatrix *dmat, std::vector *p_data, @@ -396,11 +410,11 @@ void DMatrixToCSR(DMatrix *dmat, std::vector *p_data, typedef void *DataIterHandle; // NOLINT(*) inline void Reset(DataIterHandle self) { - static_cast(self)->Reset(); + static_cast(self)->Reset(); } inline int Next(DataIterHandle self) { - return static_cast(self)->Next(); + return static_cast(self)->Next(); } class RMMAllocator; diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 92dca5503..f5f49f8eb 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -92,13 +92,10 @@ TEST(CpuPredictor, IterationRange) { } TEST(CpuPredictor, ExternalMemory) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; - size_t constexpr kPageSize = 64, kEntriesPerCol = 3; size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename); + std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); auto lparam = CreateEmptyGenericParam(GPUIDX); std::unique_ptr cpu_predictor = diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 722d24299..24e7f36ed 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -102,13 +102,10 @@ TEST(GPUPredictor, ExternalMemoryTest) { gbm::GBTreeModel model = CreateTestModel(&param, n_classes); std::vector> dmats; - dmlc::TemporaryDirectory tmpdir; - std::string file0 = tmpdir.path + "/big_0.libsvm"; - std::string file1 = tmpdir.path + "/big_1.libsvm"; - std::string file2 = tmpdir.path + "/big_2.libsvm"; - dmats.push_back(CreateSparsePageDMatrix(400, 64UL, file0)); - dmats.push_back(CreateSparsePageDMatrix(800, 128UL, file1)); - dmats.push_back(CreateSparsePageDMatrix(8000, 1024UL, file2)); + + dmats.push_back(CreateSparsePageDMatrix(400)); + 
dmats.push_back(CreateSparsePageDMatrix(800)); + dmats.push_back(CreateSparsePageDMatrix(8000)); for (const auto& dmat: dmats) { dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5); diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index ce910efed..4a0b499ff 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -98,8 +98,7 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT const std::string tmp_file = tempdir.path + "/big.libsvm"; CreateBigTestData(tmp_file, 50000); std::shared_ptr dmat(xgboost::DMatrix::Load( - tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 100)); - EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); + tmp_file + "#" + tmp_file + ".cache", true, false, "auto")); EXPECT_FALSE(dmat->SingleColBlock()); size_t num_row = dmat->Info().num_row_; std::vector labels(num_row); diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index ca8debdd1..5dc2c4982 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -27,7 +27,7 @@ void VerifySampling(size_t page_size, } gpair.SetDevice(0); - BatchParam param{0, 256, page_size}; + BatchParam param{0, 256}; auto page = (*dmat->GetBatches(param).begin()).Impl(); if (page_size != 0) { EXPECT_NE(page->n_rows, kRows); @@ -82,7 +82,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { auto gpair = GenerateRandomGradients(kRows); gpair.SetDevice(0); - BatchParam param{0, 256, kPageSize}; + BatchParam param{0, 256}; auto page = (*dmat->GetBatches(param).begin()).Impl(); EXPECT_NE(page->n_rows, kRows); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 4ff7ec106..5c6599657 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -15,7 +15,7 @@ void TestDeterministicHistogram(bool is_dense, int 
shm_size) { float sparsity = is_dense ? 0.0f : 0.5f; auto matrix = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(); - BatchParam batch_param{0, static_cast(kBins), 0}; + BatchParam batch_param{0, static_cast(kBins)}; for (auto const& batch : matrix->GetBatches(batch_param)) { auto* page = batch.Impl(); @@ -116,7 +116,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto x = GenerateRandomCategoricalSingleColumn(kRows, num_categories); auto cat_m = GetDMatrixFromData(x, kRows, 1); cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); - BatchParam batch_param{0, static_cast(kBins), 0}; + BatchParam batch_param{0, static_cast(kBins)}; tree::RowPartitioner row_partitioner(0, kRows); auto ridx = row_partitioner.GetRows(0); dh::device_vector cat_hist(num_categories); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 561377538..591dc43d2 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -152,7 +152,6 @@ TEST(GpuHist, ApplySplit) { BatchParam bparam; bparam.gpu_id = 0; bparam.max_bin = 3; - bparam.gpu_page_size = 0; for (auto& ellpack : m->GetBatches(bparam)){ auto impl = ellpack.Impl(); @@ -291,9 +290,13 @@ void TestHistogramIndexImpl() { // Extract the device maker from the histogram makers and from that its compressed // histogram index const auto &maker = hist_maker.maker; + auto grad = GenerateRandomGradients(kNRows); + grad.SetDevice(0); + maker->Reset(&grad, hist_maker_dmat.get(), kNCols); std::vector h_gidx_buffer(maker->page->gidx_buffer.HostVector()); const auto &maker_ext = hist_maker_ext.maker; + maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols); std::vector h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector()); ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins()); @@ -365,7 +368,7 @@ void UpdateTree(HostDeviceVector* gpair, DMatrix* dmat, // Loop over the batches and count the records int64_t 
batch_count = 0; int64_t row_count = 0; - for (const auto& batch : dmat->GetBatches({0, max_bin, gpu_page_size})) { + for (const auto& batch : dmat->GetBatches({0, max_bin})) { EXPECT_LT(batch.Size(), dmat->Info().num_row_); batch_count++; row_count += batch.Size(); @@ -386,7 +389,6 @@ void UpdateTree(HostDeviceVector* gpair, DMatrix* dmat, tree::GPUHistMakerSpecialised hist_maker; GenericParameter generic_param(CreateEmptyGenericParam(0)); - generic_param.gpu_page_size = gpu_page_size; hist_maker.Configure(args, &generic_param); hist_maker.Update(gpair, dmat, {tree});