Rewrite sparse dmatrix using callbacks. (#7092)
- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of the threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache cannot be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
parent 2f524e9f41
commit bd1f3a38f0
@ -37,18 +37,14 @@
|
|||||||
#include "../src/data/simple_dmatrix.cc"
|
#include "../src/data/simple_dmatrix.cc"
|
||||||
#include "../src/data/sparse_page_raw_format.cc"
|
#include "../src/data/sparse_page_raw_format.cc"
|
||||||
#include "../src/data/ellpack_page.cc"
|
#include "../src/data/ellpack_page.cc"
|
||||||
#include "../src/data/ellpack_page_source.cc"
|
|
||||||
#include "../src/data/gradient_index.cc"
|
#include "../src/data/gradient_index.cc"
|
||||||
|
#include "../src/data/sparse_page_dmatrix.cc"
|
||||||
|
#include "../src/data/proxy_dmatrix.cc"
|
||||||
|
|
||||||
// prediction
|
// prediction
|
||||||
#include "../src/predictor/predictor.cc"
|
#include "../src/predictor/predictor.cc"
|
||||||
#include "../src/predictor/cpu_predictor.cc"
|
#include "../src/predictor/cpu_predictor.cc"
|
||||||
|
|
||||||
#if DMLC_ENABLE_STD_THREAD
|
|
||||||
#include "../src/data/sparse_page_dmatrix.cc"
|
|
||||||
#include "../src/data/sparse_page_source.cc"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// trees
|
// trees
|
||||||
#include "../src/tree/param.cc"
|
#include "../src/tree/param.cc"
|
||||||
#include "../src/tree/tree_model.cc"
|
#include "../src/tree/tree_model.cc"
|
||||||
|
|||||||
@@ -223,19 +223,31 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data,
  * - XGBCallbackDataIterNext
  * - XGDMatrixCreateFromDataIter
  *
- * Another set is used by Quantile based DMatrix (used by hist algorithm) for reducing
- * memory usage. Currently only GPU implementation is available. It accept foreign data
- * iterators as callbacks and works similar to external memory. For GPU Hist, the data is
- * first compressed by quantile sketching then merged. This is particular useful for
- * distributed setting as it eliminates 2 copies of data. 1 by a `concat` from external
- * library to make the data into a blob for normal DMatrix initialization, another by the
- * internal CSR copy of DMatrix. Related functions are:
+ * Another set is used by the external data iterator. It accepts foreign data iterators
+ * as callbacks. There are 2 different scenarios where users might want to pass in
+ * callbacks instead of raw data. The first is the Quantile DMatrix used by GPU Hist.
+ * For this case, the data is first compressed by quantile sketching then merged. This
+ * is particularly useful in a distributed setting as it eliminates 2 copies of data: 1
+ * by a `concat` from the external library to make the data into a blob for normal
+ * DMatrix initialization, another by the internal CSR copy of DMatrix. The second use
+ * case is external memory support, where users can pass a custom data iterator into
+ * XGBoost for loading data in batches. There are short notes on each use case in the
+ * respective DMatrix factory functions.
  *
+ * Related functions are:
+ *
+ * # Factory functions
+ * - `XGDMatrixCreateFromCallback` for external memory
+ * - `XGDeviceQuantileDMatrixCreateFromCallback` for quantile DMatrix
+ *
+ * # Proxy that callers can use to pass data to XGBoost
  * - XGProxyDMatrixCreate
  * - XGDMatrixCallbackNext
  * - DataIterResetCallback
  * - XGProxyDMatrixSetDataCudaArrayInterface
  * - XGProxyDMatrixSetDataCudaColumnar
+ * - XGProxyDMatrixSetDataDense
+ * - XGProxyDMatrixSetDataCSR
  * - ... (data setters)
  */
 
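To make the split concrete, here is a minimal sketch (not taken from the demo) contrasting the two factory calls. `make_iterator`, `my_reset`, and `my_next` stand in for the user-defined iterator and callbacks described in the sections that follow; the `-1.0` missing value, the JSON fields, and the other numeric arguments are illustrative placeholders, and error codes are ignored for brevity.

```c
#include <xgboost/c_api.h>

/* Hypothetical user-defined iterator and callbacks; see the step-by-step notes below. */
extern void *make_iterator(void);
extern void my_reset(DataIterHandle handle);  /* matches DataIterResetCallback */
extern int my_next(DataIterHandle handle);    /* matches XGDMatrixCallbackNext */

static void create_matrices(DMatrixHandle *ext_mem, DMatrixHandle *quantile) {
  void *iter = make_iterator();
  DMatrixHandle proxy;
  XGProxyDMatrixCreate(&proxy);

  /* External memory DMatrix: construction parameters are passed as a JSON config. */
  XGDMatrixCreateFromCallback(iter, proxy, my_reset, my_next,
                              "{\"missing\": -1.0, \"cache_prefix\": \"cache\"}",
                              ext_mem);

  /* Quantile DMatrix for GPU Hist: parameters are explicit arguments. */
  XGDeviceQuantileDMatrixCreateFromCallback(iter, proxy, my_reset, my_next,
                                            /*missing=*/-1.0f, /*nthread=*/4,
                                            /*max_bin=*/256, quantile);
  XGDMatrixFree(proxy);
}
```

In practice a caller would pick one of the two paths; they are shown together here only to highlight how the parameters are passed.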
@@ -308,17 +320,9 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
     const char* cache_info,
     DMatrixHandle *out);
 
-/* == Second set of callback functions, used by constructing Quantile based DMatrix. ===
- *
- * Short note for how to use the second set of callback for GPU Hist tree method.
- *
- * Step 0: Define a data iterator with 2 methods `reset`, and `next`.
- * Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
- * Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *         `XGDeviceQuantileDMatrixCreateFromCallback`.
- * Step 3: Call appropriate data setters in `next` functions.
- *
- * See test_iterative_device_dmatrix.cu or Python interface for examples.
+/**
+ * Second set of callback functions, used for constructing the Quantile DMatrix or
+ * external memory DMatrix using a custom iterator.
 */
 
 /*!
@@ -344,8 +348,53 @@ XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter); // NOLINT(
  */
 XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLINT(*)
 
 /*!
- * \brief Create a device DMatrix with data iterator.
+ * \brief Create an external memory DMatrix with a data iterator.
+ *
+ * Short note on how to use the second set of callbacks for external memory data support:
+ *
+ * - Step 0: Define a data iterator with 2 methods `reset` and `next`.
+ * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
+ * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
+ *   `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
+ * - Step 3: Call appropriate data setters in the `next` callback.
+ *
+ * For example usage see demo/c-api/external-memory.
+ *
+ * \param iter A handle to the external data iterator.
+ * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
+ * \param reset Callback function resetting the iterator state.
+ * \param next Callback function yielding the next batch of data.
+ * \param c_json_config JSON encoded parameters for DMatrix construction. Accepted fields are:
+ *
+ *   - missing: The value to be treated as missing.
+ *   - cache_prefix: The path of the cache file. The caller must initialize all the directories in this path.
+ *   - nthread (optional): Number of threads used for initializing the DMatrix.
+ *
+ * \param out The created external memory DMatrix.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
+                                        DMatrixHandle proxy,
+                                        DataIterResetCallback *reset,
+                                        XGDMatrixCallbackNext *next,
+                                        char const* c_json_config,
+                                        DMatrixHandle *out);
+
+/*!
+ * \brief Create a Quantile DMatrix with a data iterator.
+ *
+ * Short note on how to use the second set of callbacks for the GPU Hist tree method:
+ *
+ * - Step 0: Define a data iterator with 2 methods `reset` and `next`.
+ * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
+ * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
+ *   `XGDeviceQuantileDMatrixCreateFromCallback`.
+ * - Step 3: Call appropriate data setters in the `next` callback.
+ *
+ * See test_iterative_device_dmatrix.cu or the Python interface for examples.
  *
  * \param iter A handle to external data iterator.
  * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
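For reference, a minimal self-contained sketch of the steps above, assuming two pre-allocated dense float32 batches staged through the proxy. The iterator layout, batch shape, the `-1.0` missing sentinel, and the NumPy-style `__array_interface__` string built here are illustrative assumptions rather than code copied from the demo.

```c
#include <inttypes.h>
#include <stdio.h>
#include <xgboost/c_api.h>

enum { kRows = 8, kCols = 4, kBatches = 2 };

/* Caller-owned iterator state (hypothetical layout). */
typedef struct {
  float data[kBatches][kRows * kCols];
  float labels[kBatches][kRows];
  char array_if[256];   /* staged __array_interface__ JSON for the current batch */
  int cursor;           /* which batch `next` should emit */
  DMatrixHandle proxy;  /* proxy used to stage each batch */
} Iterator;

static void reset_cb(DataIterHandle self) { ((Iterator *)self)->cursor = 0; }

static int next_cb(DataIterHandle self) {
  Iterator *it = (Iterator *)self;
  if (it->cursor == kBatches) {
    return 0;  /* 0 signals the end of iteration */
  }
  /* NumPy-style __array_interface__ for one kRows x kCols float32 batch. */
  snprintf(it->array_if, sizeof(it->array_if),
           "{\"data\": [%" PRIuPTR ", false], \"shape\": [%d, %d], "
           "\"typestr\": \"<f4\", \"version\": 3}",
           (uintptr_t)it->data[it->cursor], kRows, kCols);
  XGProxyDMatrixSetDataDense(it->proxy, it->array_if);
  /* Meta information goes through the existing setters; 1 == float32. */
  XGDMatrixSetDenseInfo(it->proxy, "label", it->labels[it->cursor], kRows, 1);
  ++it->cursor;
  return 1;  /* 1 signals that a batch has been staged */
}

int main(void) {
  Iterator it = {{{0}}};
  XGProxyDMatrixCreate(&it.proxy);

  DMatrixHandle Xy = NULL;
  /* -1.0 is a placeholder for the missing value; error codes are ignored for brevity. */
  XGDMatrixCreateFromCallback(&it, it.proxy, reset_cb, next_cb,
                              "{\"missing\": -1.0, \"cache_prefix\": \"cache\", \"nthread\": 4}",
                              &Xy);

  XGDMatrixFree(Xy);
  XGDMatrixFree(it.proxy);
  return 0;
}
```

Training then proceeds with the returned `Xy` handle exactly as with an in-memory DMatrix.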
@@ -362,6 +411,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
     DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
     XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin,
     DMatrixHandle *out);
+
 /*!
  * \brief Set data on a DMatrix proxy.
  *
@@ -387,6 +437,33 @@ XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
 XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle,
                                               const char *c_interface_str);
 
+/*!
+ * \brief Set data on a DMatrix proxy.
+ *
+ * \param handle A DMatrix proxy created by XGProxyDMatrixCreate
+ * \param c_interface_str Null terminated JSON document string representation of array
+ *                        interface.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle,
+                                       char const *c_interface_str);
+
+/*!
+ * \brief Set data on a DMatrix proxy.
+ *
+ * \param handle A DMatrix proxy created by XGProxyDMatrixCreate
+ * \param indptr JSON encoded __array_interface__ to row pointer in CSR.
+ * \param indices JSON encoded __array_interface__ to column indices in CSR.
+ * \param data JSON encoded __array_interface__ to values in CSR.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
+                                     char const *indices, char const *data,
+                                     bst_ulong ncol);
+
+
 /*
  * ==========================- End data callback APIs ==========================
  */
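As a concrete illustration of the three JSON arguments taken by the CSR setter, a hedged sketch follows; the `__array_interface__` fields and the `<u8`/`<f4` typestrs are assumptions about what the proxy accepts, not copied from the tests.

```c
#include <inttypes.h>
#include <stdio.h>
#include <xgboost/c_api.h>

/* Build a NumPy-style __array_interface__ string for a 1-D buffer. */
static void array_interface(char *out, size_t len, void const *ptr, size_t n,
                            char const *typestr) {
  snprintf(out, len,
           "{\"data\": [%" PRIuPTR ", false], \"shape\": [%zu], "
           "\"typestr\": \"%s\", \"version\": 3}",
           (uintptr_t)ptr, n, typestr);
}

/* Stage a 2x3 CSR matrix on the proxy: row 0 = {0: 1.0, 2: 2.0}, row 1 = {1: 3.0}. */
static int stage_csr(DMatrixHandle proxy) {
  static const uint64_t indptr[] = {0, 2, 3};
  static const uint64_t indices[] = {0, 2, 1};
  static const float values[] = {1.0f, 2.0f, 3.0f};
  char j_indptr[160], j_indices[160], j_values[160];
  array_interface(j_indptr, sizeof j_indptr, indptr, 3, "<u8");
  array_interface(j_indices, sizeof j_indices, indices, 3, "<u8");
  array_interface(j_values, sizeof j_values, values, 3, "<f4");
  return XGProxyDMatrixSetDataCSR(proxy, j_indptr, j_indices, j_values, /*ncol=*/3);
}
```

The new `FileIterator` in src/data/file_iterator.h builds the same three strings via `MakeArrayInterface` before calling this setter.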
|||||||
@@ -171,9 +171,12 @@ class MetaInfo {
    * \param that The other MetaInfo object.
    *
    * \param accumulate_rows Whether rows need to be accumulated in this function. If
-   *          client code knows number of rows in advance, set this parameter to false.
+   *          client code knows the number of rows in advance, set this
+   *          parameter to false.
+   * \param check_column Whether the extend method should check the consistency of
+   *          columns.
    */
-  void Extend(MetaInfo const& that, bool accumulate_rows);
+  void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);
 
  private:
   /*! \brief argsort of labels */
@@ -211,14 +214,12 @@ struct BatchParam {
   int gpu_id;
   /*! \brief Maximum number of bins per feature for histograms. */
   int max_bin{0};
-  /*! \brief Page size for external memory mode. */
-  size_t gpu_page_size;
   BatchParam() = default;
-  BatchParam(int32_t device, int32_t max_bin, size_t gpu_page_size = 0)
-      : gpu_id{device}, max_bin{max_bin}, gpu_page_size{gpu_page_size} {}
-  inline bool operator!=(const BatchParam& other) const {
-    return gpu_id != other.gpu_id || max_bin != other.max_bin ||
-           gpu_page_size != other.gpu_page_size;
+  BatchParam(int32_t device, int32_t max_bin)
+      : gpu_id{device}, max_bin{max_bin} {}
+  bool operator!=(const BatchParam& other) const {
+    return gpu_id != other.gpu_id || max_bin != other.max_bin;
   }
 };
 
@ -390,11 +391,12 @@ class GHistIndexMatrix;
|
|||||||
template<typename T>
|
template<typename T>
|
||||||
class BatchIteratorImpl {
|
class BatchIteratorImpl {
|
||||||
public:
|
public:
|
||||||
|
using iterator_category = std::forward_iterator_tag; // NOLINT
|
||||||
virtual ~BatchIteratorImpl() = default;
|
virtual ~BatchIteratorImpl() = default;
|
||||||
virtual T& operator*() = 0;
|
|
||||||
virtual const T& operator*() const = 0;
|
virtual const T& operator*() const = 0;
|
||||||
virtual void operator++() = 0;
|
virtual BatchIteratorImpl& operator++() = 0;
|
||||||
virtual bool AtEnd() const = 0;
|
virtual bool AtEnd() const = 0;
|
||||||
|
virtual std::shared_ptr<T const> Page() const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -402,15 +404,12 @@ class BatchIterator {
|
|||||||
public:
|
public:
|
||||||
using iterator_category = std::forward_iterator_tag; // NOLINT
|
using iterator_category = std::forward_iterator_tag; // NOLINT
|
||||||
explicit BatchIterator(BatchIteratorImpl<T>* impl) { impl_.reset(impl); }
|
explicit BatchIterator(BatchIteratorImpl<T>* impl) { impl_.reset(impl); }
|
||||||
|
explicit BatchIterator(std::shared_ptr<BatchIteratorImpl<T>> impl) { impl_ = impl; }
|
||||||
|
|
||||||
void operator++() {
|
BatchIterator &operator++() {
|
||||||
CHECK(impl_ != nullptr);
|
CHECK(impl_ != nullptr);
|
||||||
++(*impl_);
|
++(*impl_);
|
||||||
}
|
return *this;
|
||||||
|
|
||||||
T& operator*() {
|
|
||||||
CHECK(impl_ != nullptr);
|
|
||||||
return *(*impl_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const T& operator*() const {
|
const T& operator*() const {
|
||||||
@ -428,6 +427,10 @@ class BatchIterator {
|
|||||||
return impl_->AtEnd();
|
return impl_->AtEnd();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<T const> Page() const {
|
||||||
|
return impl_->Page();
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<BatchIteratorImpl<T>> impl_;
|
std::shared_ptr<BatchIteratorImpl<T>> impl_;
|
||||||
};
|
};
|
||||||
@ -499,8 +502,7 @@ class DMatrix {
|
|||||||
static DMatrix* Load(const std::string& uri,
|
static DMatrix* Load(const std::string& uri,
|
||||||
bool silent,
|
bool silent,
|
||||||
bool load_row_split,
|
bool load_row_split,
|
||||||
const std::string& file_format = "auto",
|
const std::string& file_format = "auto");
|
||||||
size_t page_size = kPageSize);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Creates a new DMatrix from an external data adapter.
|
* \brief Creates a new DMatrix from an external data adapter.
|
||||||
@ -516,8 +518,7 @@ class DMatrix {
|
|||||||
*/
|
*/
|
||||||
template <typename AdapterT>
|
template <typename AdapterT>
|
||||||
static DMatrix* Create(AdapterT* adapter, float missing, int nthread,
|
static DMatrix* Create(AdapterT* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix = "",
|
const std::string& cache_prefix = "");
|
||||||
size_t page_size = kPageSize);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Create a new Quantile based DMatrix used for histogram based algorithm.
|
* \brief Create a new Quantile based DMatrix used for histogram based algorithm.
|
||||||
@@ -545,6 +546,31 @@ class DMatrix {
       int nthread,
       int max_bin);
 
+  /**
+   * \brief Create an external memory DMatrix with callbacks.
+   *
+   * \tparam DataIterHandle         External iterator type, defined in C API.
+   * \tparam DMatrixHandle          DMatrix handle, defined in C API.
+   * \tparam DataIterResetCallback  Callback for reset, prototype defined in C API.
+   * \tparam XGDMatrixCallbackNext  Callback for next, prototype defined in C API.
+   *
+   * \param iter    External data iterator
+   * \param proxy   A handle to ProxyDMatrix
+   * \param reset   Callback for reset
+   * \param next    Callback for next
+   * \param missing Value that should be treated as missing.
+   * \param nthread Number of threads used for initialization.
+   * \param cache   Prefix of the cache file path.
+   *
+   * \return The created external memory DMatrix.
+   */
+  template <typename DataIterHandle, typename DMatrixHandle,
+            typename DataIterResetCallback, typename XGDMatrixCallbackNext>
+  static DMatrix *Create(DataIterHandle iter, DMatrixHandle proxy,
+                         DataIterResetCallback *reset,
+                         XGDMatrixCallbackNext *next, float missing,
+                         int32_t nthread, std::string cache);
+
   virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;
   /*! \brief Number of rows per page in external memory. Approximately 100MB per page for
    * dataset with 100 features. */
|
|||||||
@ -29,8 +29,6 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
|||||||
int gpu_id;
|
int gpu_id;
|
||||||
// fail when gpu_id is invalid
|
// fail when gpu_id is invalid
|
||||||
bool fail_on_invalid_gpu_id {false};
|
bool fail_on_invalid_gpu_id {false};
|
||||||
// gpu page size in external memory mode, 0 means using the default.
|
|
||||||
size_t gpu_page_size;
|
|
||||||
bool validate_parameters {false};
|
bool validate_parameters {false};
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@ -66,10 +64,6 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
|||||||
DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
|
DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
|
||||||
.set_default(false)
|
.set_default(false)
|
||||||
.describe("Fail with error when gpu_id is invalid.");
|
.describe("Fail with error when gpu_id is invalid.");
|
||||||
DMLC_DECLARE_FIELD(gpu_page_size)
|
|
||||||
.set_default(0)
|
|
||||||
.set_lower_bound(0)
|
|
||||||
.describe("GPU page size when running in external memory mode.");
|
|
||||||
DMLC_DECLARE_FIELD(validate_parameters)
|
DMLC_DECLARE_FIELD(validate_parameters)
|
||||||
.set_default(false)
|
.set_default(false)
|
||||||
.describe("Enable checking whether parameters are used or not.");
|
.describe("Enable checking whether parameters are used or not.");
|
||||||
|
|||||||
@ -190,6 +190,35 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Create from data iterator
|
// Create from data iterator
|
||||||
|
XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
|
||||||
|
DMatrixHandle proxy,
|
||||||
|
DataIterResetCallback *reset,
|
||||||
|
XGDMatrixCallbackNext *next,
|
||||||
|
char const* c_json_config,
|
||||||
|
DMatrixHandle *out) {
|
||||||
|
API_BEGIN();
|
||||||
|
auto config = Json::Load(StringView{c_json_config});
|
||||||
|
float missing = get<Number const>(config["missing"]);
|
||||||
|
std::string cache = get<String const>(config["cache_prefix"]);
|
||||||
|
int32_t n_threads = omp_get_max_threads();
|
||||||
|
if (!IsA<Null>(config["nthread"])) {
|
||||||
|
n_threads = get<Integer const>(config["nthread"]);
|
||||||
|
}
|
||||||
|
*out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
|
||||||
|
iter, proxy, reset, next, missing, n_threads, cache)};
|
||||||
|
API_END();
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
|
||||||
|
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
|
||||||
|
XGDMatrixCallbackNext *next, float missing, int nthread,
|
||||||
|
int max_bin, DMatrixHandle *out) {
|
||||||
|
API_BEGIN();
|
||||||
|
*out = new std::shared_ptr<xgboost::DMatrix>{
|
||||||
|
xgboost::DMatrix::Create(iter, proxy, reset, next, missing, nthread, max_bin)};
|
||||||
|
API_END();
|
||||||
|
}
|
||||||
|
|
||||||
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
|
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
|
||||||
API_BEGIN();
|
API_BEGIN();
|
||||||
*out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);;
|
*out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);;
|
||||||
@ -221,15 +250,31 @@ XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle,
|
|||||||
API_END();
|
API_END();
|
||||||
}
|
}
|
||||||
|
|
||||||
XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
|
XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle,
|
||||||
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
|
char const *c_interface_str) {
|
||||||
XGDMatrixCallbackNext *next, float missing, int nthread,
|
|
||||||
int max_bin, DMatrixHandle *out) {
|
|
||||||
API_BEGIN();
|
API_BEGIN();
|
||||||
*out = new std::shared_ptr<xgboost::DMatrix>{
|
CHECK_HANDLE();
|
||||||
xgboost::DMatrix::Create(iter, proxy, reset, next, missing, nthread, max_bin)};
|
auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
|
||||||
|
CHECK(p_m);
|
||||||
|
auto m = static_cast<xgboost::data::DMatrixProxy*>(p_m->get());
|
||||||
|
CHECK(m) << "Current DMatrix type does not support set data.";
|
||||||
|
m->SetArrayData(c_interface_str);
|
||||||
API_END();
|
API_END();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
|
||||||
|
char const *indices, char const *data,
|
||||||
|
xgboost::bst_ulong ncol) {
|
||||||
|
API_BEGIN();
|
||||||
|
CHECK_HANDLE();
|
||||||
|
auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
|
||||||
|
CHECK(p_m);
|
||||||
|
auto m = static_cast<xgboost::data::DMatrixProxy*>(p_m->get());
|
||||||
|
CHECK(m) << "Current DMatrix type does not support set data.";
|
||||||
|
m->SetCSRData(indptr, indices, data, ncol, true);
|
||||||
|
API_END();
|
||||||
|
}
|
||||||
|
|
||||||
// End Create from data iterator
|
// End Create from data iterator
|
||||||
|
|
||||||
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||||
|
|||||||
@ -91,7 +91,6 @@ void PruneImpl(int device,
|
|||||||
}
|
}
|
||||||
|
|
||||||
float w = back.rmin - front.rmax;
|
float w = back.rmin - front.rmax;
|
||||||
assert(w != 0);
|
|
||||||
auto budget = static_cast<float>(d_out.size());
|
auto budget = static_cast<float>(d_out.size());
|
||||||
assert(budget != 0);
|
assert(budget != 0);
|
||||||
auto q = ((static_cast<float>(idx) * w) / (static_cast<float>(to) - 1.0f) + front.rmax);
|
auto q = ((static_cast<float>(idx) * w) / (static_cast<float>(to) - 1.0f) + front.rmax);
|
||||||
|
|||||||
143
src/data/data.cc
@ -22,11 +22,10 @@
|
|||||||
#include "../common/threading_utils.h"
|
#include "../common/threading_utils.h"
|
||||||
#include "../data/adapter.h"
|
#include "../data/adapter.h"
|
||||||
#include "../data/iterative_device_dmatrix.h"
|
#include "../data/iterative_device_dmatrix.h"
|
||||||
|
#include "file_iterator.h"
|
||||||
|
|
||||||
#if DMLC_ENABLE_STD_THREAD
|
|
||||||
#include "./sparse_page_source.h"
|
#include "./sparse_page_source.h"
|
||||||
#include "./sparse_page_dmatrix.h"
|
#include "./sparse_page_dmatrix.h"
|
||||||
#endif // DMLC_ENABLE_STD_THREAD
|
|
||||||
|
|
||||||
namespace dmlc {
|
namespace dmlc {
|
||||||
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
|
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
|
||||||
@ -500,13 +499,17 @@ void MetaInfo::GetFeatureInfo(const char *field,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
|
void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_column) {
|
||||||
if (accumulate_rows) {
|
if (accumulate_rows) {
|
||||||
this->num_row_ += that.num_row_;
|
this->num_row_ += that.num_row_;
|
||||||
}
|
}
|
||||||
if (this->num_col_ != 0) {
|
if (this->num_col_ != 0) {
|
||||||
|
if (check_column) {
|
||||||
CHECK_EQ(this->num_col_, that.num_col_)
|
CHECK_EQ(this->num_col_, that.num_col_)
|
||||||
<< "Number of columns must be consistent across batches.";
|
<< "Number of columns must be consistent across batches.";
|
||||||
|
} else {
|
||||||
|
this->num_col_ = std::max(this->num_col_, that.num_col_);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
this->num_col_ = that.num_col_;
|
this->num_col_ = that.num_col_;
|
||||||
|
|
||||||
@ -630,11 +633,34 @@ DMatrix::~DMatrix() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DMatrix *TryLoadBinary(std::string fname, bool silent) {
|
||||||
|
int magic;
|
||||||
|
std::unique_ptr<dmlc::Stream> fi(
|
||||||
|
dmlc::Stream::Create(fname.c_str(), "r", true));
|
||||||
|
if (fi != nullptr) {
|
||||||
|
common::PeekableInStream is(fi.get());
|
||||||
|
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
|
||||||
|
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||||
|
dmlc::ByteSwap(&magic, sizeof(magic), 1);
|
||||||
|
}
|
||||||
|
if (magic == data::SimpleDMatrix::kMagic) {
|
||||||
|
DMatrix *dmat = new data::SimpleDMatrix(&is);
|
||||||
|
if (!silent) {
|
||||||
|
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_
|
||||||
|
<< " matrix with " << dmat->Info().num_nonzero_
|
||||||
|
<< " entries loaded from " << fname;
|
||||||
|
}
|
||||||
|
return dmat;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
DMatrix* DMatrix::Load(const std::string& uri,
|
DMatrix* DMatrix::Load(const std::string& uri,
|
||||||
bool silent,
|
bool silent,
|
||||||
bool load_row_split,
|
bool load_row_split,
|
||||||
const std::string& file_format,
|
const std::string& file_format) {
|
||||||
const size_t page_size) {
|
|
||||||
std::string fname, cache_file;
|
std::string fname, cache_file;
|
||||||
size_t dlm_pos = uri.find('#');
|
size_t dlm_pos = uri.find('#');
|
||||||
if (dlm_pos != std::string::npos) {
|
if (dlm_pos != std::string::npos) {
|
||||||
@ -682,35 +708,34 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
|||||||
|
|
||||||
// legacy handling of binary data loading
|
// legacy handling of binary data loading
|
||||||
if (file_format == "auto" && npart == 1) {
|
if (file_format == "auto" && npart == 1) {
|
||||||
int magic;
|
DMatrix *loaded = TryLoadBinary(fname, silent);
|
||||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
|
if (loaded) {
|
||||||
if (fi != nullptr) {
|
return loaded;
|
||||||
common::PeekableInStream is(fi.get());
|
|
||||||
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
|
|
||||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
|
||||||
dmlc::ByteSwap(&magic, sizeof(magic), 1);
|
|
||||||
}
|
|
||||||
if (magic == data::SimpleDMatrix::kMagic) {
|
|
||||||
DMatrix* dmat = new data::SimpleDMatrix(&is);
|
|
||||||
if (!silent) {
|
|
||||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
|
||||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
|
||||||
}
|
|
||||||
return dmat;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<dmlc::Parser<uint32_t> > parser(
|
|
||||||
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
|
|
||||||
data::FileAdapter adapter(parser.get());
|
|
||||||
DMatrix* dmat {nullptr};
|
DMatrix* dmat {nullptr};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1,
|
if (cache_file.empty()) {
|
||||||
cache_file, page_size);
|
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||||
} catch (dmlc::Error& e) {
|
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart,
|
||||||
|
file_format.c_str()));
|
||||||
|
data::FileAdapter adapter(parser.get());
|
||||||
|
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||||
|
1, cache_file);
|
||||||
|
} else {
|
||||||
|
data::FileIterator iter{fname, uint32_t(partid), uint32_t(npart),
|
||||||
|
file_format};
|
||||||
|
dmat = new data::SparsePageDMatrix{
|
||||||
|
&iter,
|
||||||
|
iter.Proxy(),
|
||||||
|
data::fileiter::Reset,
|
||||||
|
data::fileiter::Next,
|
||||||
|
std::numeric_limits<float>::quiet_NaN(),
|
||||||
|
1,
|
||||||
|
cache_file};
|
||||||
|
}
|
||||||
|
} catch (dmlc::Error &e) {
|
||||||
std::vector<std::string> splited = common::Split(fname, '#');
|
std::vector<std::string> splited = common::Split(fname, '#');
|
||||||
std::vector<std::string> args = common::Split(splited.front(), '?');
|
std::vector<std::string> args = common::Split(splited.front(), '?');
|
||||||
std::string format {file_format};
|
std::string format {file_format};
|
||||||
@ -734,10 +759,6 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
|||||||
LOG(FATAL) << "Encountered parser error:\n" << e.what();
|
LOG(FATAL) << "Encountered parser error:\n" << e.what();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!silent) {
|
|
||||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
|
||||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
|
||||||
}
|
|
||||||
/* sync up number of features after matrix loaded.
|
/* sync up number of features after matrix loaded.
|
||||||
* partitioned data will fail the train/val validation check
|
* partitioned data will fail the train/val validation check
|
||||||
* since partitioned data not knowing the real number of features. */
|
* since partitioned data not knowing the real number of features. */
|
||||||
@ -769,12 +790,19 @@ DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
|
|||||||
XGDMatrixCallbackNext *next, float missing,
|
XGDMatrixCallbackNext *next, float missing,
|
||||||
int nthread,
|
int nthread,
|
||||||
int max_bin) {
|
int max_bin) {
|
||||||
#if defined(XGBOOST_USE_CUDA)
|
return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing,
|
||||||
return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing, nthread, max_bin);
|
nthread, max_bin);
|
||||||
#else
|
}
|
||||||
common::AssertGPUSupport();
|
|
||||||
return nullptr;
|
template <typename DataIterHandle, typename DMatrixHandle,
|
||||||
#endif
|
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
|
||||||
|
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
|
||||||
|
DataIterResetCallback *reset,
|
||||||
|
XGDMatrixCallbackNext *next, float missing,
|
||||||
|
int32_t n_threads,
|
||||||
|
std::string cache) {
|
||||||
|
return new data::SparsePageDMatrix(iter, proxy, reset, next, missing, n_threads,
|
||||||
|
cache);
|
||||||
}
|
}
|
||||||
|
|
||||||
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
|
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
|
||||||
@ -783,49 +811,42 @@ template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
|
|||||||
XGDMatrixCallbackNext *next, float missing, int nthread,
|
XGDMatrixCallbackNext *next, float missing, int nthread,
|
||||||
int max_bin);
|
int max_bin);
|
||||||
|
|
||||||
|
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
|
||||||
|
DataIterResetCallback, XGDMatrixCallbackNext>(
|
||||||
|
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
|
||||||
|
XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
|
||||||
|
|
||||||
template <typename AdapterT>
|
template <typename AdapterT>
|
||||||
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
|
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size) {
|
const std::string& cache_prefix) {
|
||||||
if (cache_prefix.length() == 0) {
|
|
||||||
// Data split mode is fixed to be row right now.
|
|
||||||
return new data::SimpleDMatrix(adapter, missing, nthread);
|
return new data::SimpleDMatrix(adapter, missing, nthread);
|
||||||
} else {
|
|
||||||
#if DMLC_ENABLE_STD_THREAD
|
|
||||||
return new data::SparsePageDMatrix(adapter, missing, nthread, cache_prefix,
|
|
||||||
page_size);
|
|
||||||
#else
|
|
||||||
LOG(FATAL) << "External memory is not enabled in mingw";
|
|
||||||
return nullptr;
|
|
||||||
#endif // DMLC_ENABLE_STD_THREAD
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template DMatrix* DMatrix::Create<data::DenseAdapter>(
|
template DMatrix* DMatrix::Create<data::DenseAdapter>(
|
||||||
data::DenseAdapter* adapter, float missing, int nthread,
|
data::DenseAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::ArrayAdapter>(
|
template DMatrix* DMatrix::Create<data::ArrayAdapter>(
|
||||||
data::ArrayAdapter* adapter, float missing, int nthread,
|
data::ArrayAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::CSRAdapter>(
|
template DMatrix* DMatrix::Create<data::CSRAdapter>(
|
||||||
data::CSRAdapter* adapter, float missing, int nthread,
|
data::CSRAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::CSCAdapter>(
|
template DMatrix* DMatrix::Create<data::CSCAdapter>(
|
||||||
data::CSCAdapter* adapter, float missing, int nthread,
|
data::CSCAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::DataTableAdapter>(
|
template DMatrix* DMatrix::Create<data::DataTableAdapter>(
|
||||||
data::DataTableAdapter* adapter, float missing, int nthread,
|
data::DataTableAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::FileAdapter>(
|
template DMatrix* DMatrix::Create<data::FileAdapter>(
|
||||||
data::FileAdapter* adapter, float missing, int nthread,
|
data::FileAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(
|
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(
|
||||||
data::CSRArrayAdapter* adapter, float missing, int nthread,
|
data::CSRArrayAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix *
|
template DMatrix *
|
||||||
DMatrix::Create(data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
|
DMatrix::Create(data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
|
||||||
XGBoostBatchCSR> *adapter,
|
XGBoostBatchCSR> *adapter,
|
||||||
float missing, int nthread, const std::string &cache_prefix,
|
float missing, int nthread, const std::string &cache_prefix);
|
||||||
size_t page_size);
|
|
||||||
|
|
||||||
SparsePage SparsePage::GetTranspose(int num_columns) const {
|
SparsePage SparsePage::GetTranspose(int num_columns) const {
|
||||||
SparsePage transpose;
|
SparsePage transpose;
|
||||||
@ -1044,6 +1065,8 @@ SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthrea
|
|||||||
template uint64_t
|
template uint64_t
|
||||||
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
|
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
|
||||||
template uint64_t
|
template uint64_t
|
||||||
|
SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, int nthread);
|
||||||
|
template uint64_t
|
||||||
SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
|
SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
|
||||||
template uint64_t
|
template uint64_t
|
||||||
SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread);
|
SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread);
|
||||||
|
|||||||
@ -167,7 +167,7 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
|
|||||||
|
|
||||||
template <typename AdapterT>
|
template <typename AdapterT>
|
||||||
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
|
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size) {
|
const std::string& cache_prefix) {
|
||||||
CHECK_EQ(cache_prefix.size(), 0)
|
CHECK_EQ(cache_prefix.size(), 0)
|
||||||
<< "Device memory construction is not currently supported with external "
|
<< "Device memory construction is not currently supported with external "
|
||||||
"memory.";
|
"memory.";
|
||||||
@ -176,8 +176,8 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
|
|||||||
|
|
||||||
template DMatrix* DMatrix::Create<data::CudfAdapter>(
|
template DMatrix* DMatrix::Create<data::CudfAdapter>(
|
||||||
data::CudfAdapter* adapter, float missing, int nthread,
|
data::CudfAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
template DMatrix* DMatrix::Create<data::CupyAdapter>(
|
template DMatrix* DMatrix::Create<data::CupyAdapter>(
|
||||||
data::CupyAdapter* adapter, float missing, int nthread,
|
data::CupyAdapter* adapter, float missing, int nthread,
|
||||||
const std::string& cache_prefix, size_t page_size);
|
const std::string& cache_prefix);
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -122,6 +122,7 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
|
|||||||
dmat->Info().feature_types.SetDevice(param.gpu_id);
|
dmat->Info().feature_types.SetDevice(param.gpu_id);
|
||||||
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
|
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
|
||||||
monitor_.Start("BinningCompression");
|
monitor_.Start("BinningCompression");
|
||||||
|
CHECK(dmat->SingleColBlock());
|
||||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||||
CreateHistIndices(param.gpu_id, batch, ft);
|
CreateHistIndices(param.gpu_id, batch, ft);
|
||||||
}
|
}
|
||||||
@ -301,9 +302,8 @@ struct CopyPage {
|
|||||||
// The number of elements to skip.
|
// The number of elements to skip.
|
||||||
size_t offset;
|
size_t offset;
|
||||||
|
|
||||||
CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
|
CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset)
|
||||||
: cbw{dst->NumSymbols()},
|
: cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()},
|
||||||
dst_data_d{dst->gidx_buffer.DevicePointer()},
|
|
||||||
src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
|
src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
|
||||||
offset(offset) {}
|
offset(offset) {}
|
||||||
|
|
||||||
@ -314,7 +314,8 @@ struct CopyPage {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Copy the data from the given EllpackPage to the current page.
|
// Copy the data from the given EllpackPage to the current page.
|
||||||
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
|
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
|
||||||
|
size_t offset) {
|
||||||
monitor_.Start("Copy");
|
monitor_.Start("Copy");
|
||||||
size_t num_elements = page->n_rows * page->row_stride;
|
size_t num_elements = page->n_rows * page->row_stride;
|
||||||
CHECK_EQ(row_stride, page->row_stride);
|
CHECK_EQ(row_stride, page->row_stride);
|
||||||
@ -351,7 +352,7 @@ struct CompactPage {
|
|||||||
size_t base_rowid;
|
size_t base_rowid;
|
||||||
size_t row_stride;
|
size_t row_stride;
|
||||||
|
|
||||||
CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
|
CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src,
|
||||||
common::Span<size_t> row_indexes)
|
common::Span<size_t> row_indexes)
|
||||||
: cbw{dst->NumSymbols()},
|
: cbw{dst->NumSymbols()},
|
||||||
dst_data_d{dst->gidx_buffer.DevicePointer()},
|
dst_data_d{dst->gidx_buffer.DevicePointer()},
|
||||||
@ -374,7 +375,7 @@ struct CompactPage {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Compacts the data from the given EllpackPage into the current page.
|
// Compacts the data from the given EllpackPage into the current page.
|
||||||
void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
|
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
|
||||||
common::Span<size_t> row_indexes) {
|
common::Span<size_t> row_indexes) {
|
||||||
monitor_.Start("Compact");
|
monitor_.Start("Compact");
|
||||||
CHECK_EQ(row_stride, page->row_stride);
|
CHECK_EQ(row_stride, page->row_stride);
|
||||||
@ -459,7 +460,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
|
|||||||
gidx_buffer.DevicePointer(), row_ptrs.data().get(),
|
gidx_buffer.DevicePointer(), row_ptrs.data().get(),
|
||||||
entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
|
entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
|
||||||
device_accessor.feature_segments.data(), feature_types,
|
device_accessor.feature_segments.data(), feature_types,
|
||||||
row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
|
batch_row_begin, batch_nrows, row_stride,
|
||||||
null_gidx_value);
|
null_gidx_value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -164,7 +164,7 @@ class EllpackPageImpl {
|
|||||||
* @param offset The number of elements to skip before copying.
|
* @param offset The number of elements to skip before copying.
|
||||||
* @returns The number of elements copied.
|
* @returns The number of elements copied.
|
||||||
*/
|
*/
|
||||||
size_t Copy(int device, EllpackPageImpl* page, size_t offset);
|
size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
|
||||||
|
|
||||||
/*! \brief Compact the given ELLPACK page into the current page.
|
/*! \brief Compact the given ELLPACK page into the current page.
|
||||||
*
|
*
|
||||||
@ -172,7 +172,7 @@ class EllpackPageImpl {
|
|||||||
* @param page The ELLPACK page to compact from.
|
* @param page The ELLPACK page to compact from.
|
||||||
* @param row_indexes Row indexes for the compacted page.
|
* @param row_indexes Row indexes for the compacted page.
|
||||||
*/
|
*/
|
||||||
void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
|
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
|
||||||
|
|
||||||
|
|
||||||
/*! \return Number of instances in the page. */
|
/*! \return Number of instances in the page. */
|
||||||
|
|||||||
@ -1,24 +0,0 @@
|
|||||||
/*!
|
|
||||||
* Copyright 2019 XGBoost contributors
|
|
||||||
*/
|
|
||||||
#ifndef XGBOOST_USE_CUDA
|
|
||||||
#include <dmlc/base.h>
|
|
||||||
#if DMLC_ENABLE_STD_THREAD
|
|
||||||
|
|
||||||
#include "ellpack_page_source.h"
|
|
||||||
#include <xgboost/data.h>
|
|
||||||
namespace xgboost {
|
|
||||||
namespace data {
|
|
||||||
|
|
||||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
|
||||||
const std::string& cache_info,
|
|
||||||
const BatchParam& param) noexcept(false) {
|
|
||||||
LOG(FATAL)
|
|
||||||
<< "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace data
|
|
||||||
} // namespace xgboost
|
|
||||||
#endif // DMLC_ENABLE_STD_THREAD
|
|
||||||
#endif // XGBOOST_USE_CUDA
|
|
||||||
@ -1,89 +1,24 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2019 XGBoost contributors
|
* Copyright 2019-2021 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "../common/hist_util.cuh"
|
|
||||||
|
|
||||||
#include "ellpack_page.cuh"
|
#include "ellpack_page.cuh"
|
||||||
#include "ellpack_page_source.h"
|
#include "ellpack_page_source.h"
|
||||||
#include "sparse_page_source.h"
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
void EllpackPageSource::Fetch() {
|
||||||
// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
|
if (!this->ReadCache()) {
|
||||||
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
auto const &csr = source_->Page();
|
||||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
this->page_.reset(new EllpackPage{});
|
||||||
const std::string& cache_info,
|
auto *impl = this->page_->Impl();
|
||||||
const BatchParam& param) noexcept(false) {
|
*impl = EllpackPageImpl(param_.gpu_id, *cuts_, *csr, is_dense_, row_stride_,
|
||||||
cache_info_ = ParseCacheInfo(cache_info, kPageType_);
|
feature_types_);
|
||||||
for (auto file : cache_info_.name_shards) {
|
page_->SetBaseRowId(csr->base_rowid);
|
||||||
CheckCacheFileExists(file);
|
this->WriteCache();
|
||||||
}
|
|
||||||
if (param.gpu_page_size > 0) {
|
|
||||||
page_size_ = param.gpu_page_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
monitor_.Init("ellpack_page_source");
|
|
||||||
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
|
||||||
|
|
||||||
monitor_.Start("Quantiles");
|
|
||||||
size_t row_stride = GetRowStride(dmat);
|
|
||||||
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
|
|
||||||
monitor_.Stop("Quantiles");
|
|
||||||
|
|
||||||
monitor_.Start("WriteEllpackPages");
|
|
||||||
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
|
|
||||||
monitor_.Stop("WriteEllpackPages");
|
|
||||||
|
|
||||||
external_prefetcher_.reset(
|
|
||||||
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
|
|
||||||
void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
|
|
||||||
const common::HistogramCuts& cuts,
|
|
||||||
const std::string& cache_info,
|
|
||||||
size_t row_stride) const {
|
|
||||||
auto cinfo = ParseCacheInfo(cache_info, kPageType_);
|
|
||||||
const size_t extra_buffer_capacity = 6;
|
|
||||||
SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
|
|
||||||
extra_buffer_capacity);
|
|
||||||
std::shared_ptr<EllpackPage> page;
|
|
||||||
SparsePage temp_host_page;
|
|
||||||
writer.Alloc(&page);
|
|
||||||
auto* impl = page->Impl();
|
|
||||||
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
|
|
||||||
|
|
||||||
size_t bytes_write = 0;
|
|
||||||
double tstart = dmlc::GetTime();
|
|
||||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
|
||||||
temp_host_page.Push(batch);
|
|
||||||
|
|
||||||
size_t mem_cost_bytes =
|
|
||||||
EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
|
|
||||||
if (mem_cost_bytes >= page_size_) {
|
|
||||||
bytes_write += mem_cost_bytes;
|
|
||||||
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
|
||||||
row_stride, ft);
|
|
||||||
writer.PushWrite(std::move(page));
|
|
||||||
writer.Alloc(&page);
|
|
||||||
impl = page->Impl();
|
|
||||||
temp_host_page.Clear();
|
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
|
||||||
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
|
||||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
|
||||||
<< (bytes_write >> 20UL) << " written";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (temp_host_page.Size() != 0) {
|
|
||||||
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
|
||||||
row_stride, ft);
|
|
||||||
writer.PushWrite(std::move(page));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2019 by XGBoost Contributors
|
* Copyright 2019-2021 by XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
|
#ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
|
||||||
@ -8,57 +8,44 @@
|
|||||||
#include <xgboost/data.h>
|
#include <xgboost/data.h>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
#include "../common/timer.h"
|
#include "../common/common.h"
|
||||||
#include "../common/hist_util.h"
|
#include "../common/hist_util.h"
|
||||||
#include "sparse_page_source.h"
|
#include "sparse_page_source.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
/*!
|
class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
|
||||||
* \brief External memory data source for ELLPACK format.
|
bool is_dense_;
|
||||||
*
|
size_t row_stride_;
|
||||||
*/
|
BatchParam param_;
|
||||||
class EllpackPageSource {
|
common::Span<FeatureType const> feature_types_;
|
||||||
|
std::unique_ptr<common::HistogramCuts> cuts_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/*!
|
EllpackPageSource(
|
||||||
* \brief Create source from cache files the cache_prefix.
|
float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
|
||||||
* \param cache_prefix The prefix of cache we want to solve.
|
std::shared_ptr<Cache> cache, BatchParam param,
|
||||||
*/
|
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense,
|
||||||
explicit EllpackPageSource(DMatrix* dmat,
|
size_t row_stride, common::Span<FeatureType const> feature_types,
|
||||||
const std::string& cache_info,
|
std::shared_ptr<SparsePageSource> source)
|
||||||
const BatchParam& param) noexcept(false);
|
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache),
|
||||||
|
is_dense_{is_dense}, row_stride_{row_stride}, param_{param},
|
||||||
BatchSet<EllpackPage> GetBatchSet() {
|
feature_types_{feature_types}, cuts_{std::move(cuts)} {
|
||||||
auto begin_iter = BatchIterator<EllpackPage>(
|
this->source_ = source;
|
||||||
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
|
this->Fetch();
|
||||||
EllpackPage>(external_prefetcher_.get()));
|
|
||||||
return BatchSet<EllpackPage>(begin_iter);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
~EllpackPageSource() {
|
void Fetch() final;
|
||||||
external_prefetcher_.reset();
|
|
||||||
for (auto file : cache_info_.name_shards) {
|
|
||||||
TryDeleteCacheFile(file);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
void WriteEllpackPages(int device, DMatrix* dmat,
|
|
||||||
const common::HistogramCuts& cuts,
|
|
||||||
const std::string& cache_info,
|
|
||||||
size_t row_stride) const;
|
|
||||||
|
|
||||||
/*! \brief The page type string for ELLPACK. */
|
|
||||||
const std::string kPageType_{".ellpack.page"};
|
|
||||||
|
|
||||||
size_t page_size_{DMatrix::kPageSize};
|
|
||||||
common::Monitor monitor_;
|
|
||||||
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
|
|
||||||
CacheInfo cache_info_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if !defined(XGBOOST_USE_CUDA)
|
||||||
|
inline void EllpackPageSource::Fetch() {
|
||||||
|
common::AssertGPUSupport();
|
||||||
|
}
|
||||||
|
#endif // !defined(XGBOOST_USE_CUDA)
|
||||||
} // namespace data
|
} // namespace data
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|
||||||
|
|||||||
115
src/data/file_iterator.h
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright 2021 XGBoost contributors
|
||||||
|
*/
|
||||||
|
#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
|
||||||
|
#define XGBOOST_DATA_FILE_ITERATOR_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <memory>
|
||||||
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "dmlc/data.h"
|
||||||
|
#include "xgboost/c_api.h"
|
||||||
|
#include "xgboost/json.h"
|
||||||
|
#include "array_interface.h"
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
namespace data {
|
||||||
|
/**
|
||||||
|
* An iterator for implementing external memory support with file inputs. Users of
|
||||||
|
* external memory are encouraged to define their own file parsers/loaders so this one is
|
||||||
|
* just here for compatibility with old versions of XGBoost and the CLI interface.
|
||||||
|
*/
|
||||||
|
class FileIterator {
|
||||||
|
// URI of the input file; encodes parameters such as whether it uses 1-based indexing. The dmlc
|
||||||
|
// parser will decode this information.
|
||||||
|
std::string uri_;
|
||||||
|
// Equals to rank_id in distributed training, used to split file into parts for each
|
||||||
|
// worker.
|
||||||
|
uint32_t part_idx_;
|
||||||
|
// Equals to total number of workers.
|
||||||
|
uint32_t n_parts_;
|
||||||
|
// Format of the input file, like "libsvm".
|
||||||
|
std::string type_;
|
||||||
|
|
||||||
|
DMatrixHandle proxy_;
|
||||||
|
|
||||||
|
std::unique_ptr<dmlc::Parser<uint32_t>> parser_;
|
||||||
|
// Temporary reference to stage the data.
|
||||||
|
dmlc::RowBlock<uint32_t, float> row_block_;
|
||||||
|
// Storage for the array interface strings.
|
||||||
|
std::string indptr_;
|
||||||
|
std::string values_;
|
||||||
|
std::string indices_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
FileIterator(std::string uri, unsigned part_index, unsigned num_parts,
|
||||||
|
std::string type)
|
||||||
|
: uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts},
|
||||||
|
type_{std::move(type)} {
|
||||||
|
XGProxyDMatrixCreate(&proxy_);
|
||||||
|
}
|
||||||
|
~FileIterator() {
|
||||||
|
XGDMatrixFree(proxy_);
|
||||||
|
}
|
||||||
|
|
||||||
|
int Next() {
|
||||||
|
CHECK(parser_);
|
||||||
|
if (parser_->Next()) {
|
||||||
|
row_block_ = parser_->Value();
|
||||||
|
|
||||||
|
indptr_ = MakeArrayInterface(row_block_.offset, row_block_.size + 1);
|
||||||
|
values_ = MakeArrayInterface(row_block_.value,
|
||||||
|
row_block_.offset[row_block_.size]);
|
||||||
|
indices_ = MakeArrayInterface(row_block_.index,
|
||||||
|
row_block_.offset[row_block_.size]);
|
||||||
|
|
||||||
|
size_t n_columns = *std::max_element(
|
||||||
|
row_block_.index,
|
||||||
|
row_block_.index + row_block_.offset[row_block_.size]);
|
||||||
|
// dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore
|
||||||
|
// this condition and just add 1 to n_columns
|
||||||
|
n_columns += 1;
|
||||||
|
|
||||||
|
XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(),
|
||||||
|
values_.c_str(), n_columns);
|
||||||
|
|
||||||
|
if (row_block_.label) {
|
||||||
|
XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1);
|
||||||
|
}
|
||||||
|
if (row_block_.qid) {
|
||||||
|
XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1);
|
||||||
|
}
|
||||||
|
if (row_block_.weight) {
|
||||||
|
XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1);
|
||||||
|
}
|
||||||
|
// Continue iteration
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
// Stop iteration
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto Proxy() -> decltype(proxy_) { return proxy_; }
|
||||||
|
|
||||||
|
void Reset() {
|
||||||
|
CHECK(!type_.empty());
|
||||||
|
parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_,
|
||||||
|
n_parts_, type_.c_str()));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace fileiter {
|
||||||
|
inline void Reset(DataIterHandle self) {
|
||||||
|
static_cast<FileIterator*>(self)->Reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int Next(DataIterHandle self) {
|
||||||
|
return static_cast<FileIterator*>(self)->Next();
|
||||||
|
}
|
||||||
|
} // namespace fileiter
|
||||||
|
} // namespace data
|
||||||
|
} // namespace xgboost
|
||||||
|
#endif // XGBOOST_DATA_FILE_ITERATOR_H_
|
||||||
@ -143,7 +143,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
     proxy->Info().num_row_ = num_rows();
     proxy->Info().num_col_ = cols;
     if (batches != 1) {
-      this->info_.Extend(std::move(proxy->Info()), false);
+      this->info_.Extend(std::move(proxy->Info()), false, true);
     }
     n_batches_for_verification++;
   }
@ -163,7 +163,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
 BatchSet<EllpackPage> IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) {
   CHECK(page_);
   auto begin_iter =
-      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(page_.get()));
+      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(page_));
   return BatchSet<EllpackPage>(begin_iter);
 }
 }  // namespace data
@ -14,6 +14,7 @@
 #include "xgboost/data.h"
 #include "xgboost/c_api.h"
 #include "proxy_dmatrix.h"
+#include "simple_batch_iterator.h"

 namespace xgboost {
 namespace data {
@ -36,9 +37,10 @@ class IterativeDeviceDMatrix : public DMatrix {
                          XGDMatrixCallbackNext *next, float missing,
                          int nthread, int max_bin)
       : proxy_{proxy}, reset_{reset}, next_{next} {
-    batch_param_ = BatchParam{0, max_bin, 0};
+    batch_param_ = BatchParam{0, max_bin};
     this->Initialize(iter, missing, nthread);
   }
+  ~IterativeDeviceDMatrix() override = default;

   bool EllpackExists() const override { return true; }
   bool SparsePageExists() const override { return false; }
@ -74,6 +76,18 @@ class IterativeDeviceDMatrix : public DMatrix {
     return info_;
   }
 };

+#if !defined(XGBOOST_USE_CUDA)
+inline void IterativeDeviceDMatrix::Initialize(DataIterHandle iter, float missing, int nthread) {
+  common::AssertGPUSupport();
+}
+inline BatchSet<EllpackPage> IterativeDeviceDMatrix::GetEllpackBatches(const BatchParam& param) {
+  common::AssertGPUSupport();
+  auto begin_iter =
+      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(page_));
+  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace data
 }  // namespace xgboost
@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 XGBoost contributors
+ * Copyright 2020-2021 XGBoost contributors
 */
 #ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
 #define XGBOOST_DATA_PROXY_DMATRIX_H_
@ -1,10 +1,13 @@
 /*!
- * Copyright 2019 XGBoost contributors
+ * Copyright 2019-2021 XGBoost contributors
 */
 #ifndef XGBOOST_DATA_SIMPLE_BATCH_ITERATOR_H_
 #define XGBOOST_DATA_SIMPLE_BATCH_ITERATOR_H_

-#include <xgboost/data.h>
+#include <memory>
+#include <utility>
+
+#include "xgboost/data.h"

 namespace xgboost {
 namespace data {
@ -12,20 +15,21 @@ namespace data {
 template<typename T>
 class SimpleBatchIteratorImpl : public BatchIteratorImpl<T> {
  public:
-  explicit SimpleBatchIteratorImpl(T* page) : page_(page) {}
-  T& operator*() override {
-    CHECK(page_ != nullptr);
-    return *page_;
-  }
+  explicit SimpleBatchIteratorImpl(std::shared_ptr<T const> page) : page_(std::move(page)) {}
   const T& operator*() const override {
     CHECK(page_ != nullptr);
     return *page_;
   }
-  void operator++() override { page_ = nullptr; }
+  SimpleBatchIteratorImpl &operator++() override {
+    page_ = nullptr;
+    return *this;
+  }
   bool AtEnd() const override { return page_ == nullptr; }
+
+  std::shared_ptr<T const> Page() const override { return page_; }

  private:
-  T* page_{nullptr};
+  std::shared_ptr<T const> page_{nullptr};
 };

 }  // namespace data
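The iterator now holds a `std::shared_ptr<T const>` instead of a raw pointer, so a single in-memory page can be handed out without transferring ownership and callers only ever see const pages. A minimal sketch of the one-shot iteration contract, assuming `SparsePage` as the page type and using `assert` in place of the project's CHECK macro:

    // Standalone sketch (not part of the commit) of SimpleBatchIteratorImpl's contract.
    #include <cassert>
    #include <memory>
    #include "simple_batch_iterator.h"   // in-tree header added/changed by this commit

    void WalkSinglePage(std::shared_ptr<xgboost::SparsePage const> page) {
      xgboost::data::SimpleBatchIteratorImpl<xgboost::SparsePage> it{page};  // shares ownership
      assert(!it.AtEnd());
      auto const& batch = *it;   // const access only; pages are immutable to callers
      (void)batch;
      ++it;                      // a single page: advancing drops the reference
      assert(it.AtEnd());
    }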
@ -1,5 +1,5 @@
 /*!
- * Copyright 2014~2020 by Contributors
+ * Copyright 2014~2021 by Contributors
  * \file simple_dmatrix.cc
  * \brief the input data structure for gradient boosting
  * \author Tianqi Chen
@ -27,7 +27,7 @@ const MetaInfo& SimpleDMatrix::Info() const { return info_; }

 DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
   auto out = new SimpleDMatrix;
-  SparsePage& out_page = out->sparse_page_;
+  SparsePage& out_page = *out->sparse_page_;
   for (auto const &page : this->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     auto& h_data = out_page.data.HostVector();
@ -48,17 +48,17 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
   auto begin_iter = BatchIterator<SparsePage>(
-      new SimpleBatchIteratorImpl<SparsePage>(&sparse_page_));
+      new SimpleBatchIteratorImpl<SparsePage>(sparse_page_));
   return BatchSet<SparsePage>(begin_iter);
 }

 BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches() {
   // column page doesn't exist, generate it
   if (!column_page_) {
-    column_page_.reset(new CSCPage(sparse_page_.GetTranspose(info_.num_col_)));
+    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_)));
   }
   auto begin_iter =
-      BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_.get()));
+      BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
   return BatchSet<CSCPage>(begin_iter);
 }

@ -66,11 +66,11 @@ BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches() {
   // Sorted column page doesn't exist, generate it
   if (!sorted_column_page_) {
     sorted_column_page_.reset(
-        new SortedCSCPage(sparse_page_.GetTranspose(info_.num_col_)));
+        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_)));
     sorted_column_page_->SortRows();
   }
   auto begin_iter = BatchIterator<SortedCSCPage>(
-      new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_.get()));
+      new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_));
   return BatchSet<SortedCSCPage>(begin_iter);
 }

@ -86,7 +86,7 @@ BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(const BatchParam& param)
     batch_param_ = param;
   }
   auto begin_iter =
-      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_.get()));
+      BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_));
   return BatchSet<EllpackPage>(begin_iter);
 }

@ -100,7 +100,7 @@ BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(const BatchParam& par
     batch_param_ = param;
   }
   auto begin_iter = BatchIterator<GHistIndexMatrix>(
-      new SimpleBatchIteratorImpl<GHistIndexMatrix>(gradient_index_.get()));
+      new SimpleBatchIteratorImpl<GHistIndexMatrix>(gradient_index_));
   return BatchSet<GHistIndexMatrix>(begin_iter);
 }

@ -110,8 +110,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   uint64_t default_max = std::numeric_limits<uint64_t>::max();
   uint64_t last_group_id = default_max;
   bst_uint group_size = 0;
-  auto& offset_vec = sparse_page_.offset.HostVector();
-  auto& data_vec = sparse_page_.data.HostVector();
+  auto& offset_vec = sparse_page_->offset.HostVector();
+  auto& data_vec = sparse_page_->data.HostVector();
   uint64_t inferred_num_columns = 0;
   uint64_t total_batch_size = 0;
   // batch_size is either number of rows or cols, depending on data layout
@ -120,7 +120,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   // Iterate over batches of input data
   while (adapter->Next()) {
     auto& batch = adapter->Value();
-    auto batch_max_columns = sparse_page_.Push(batch, missing, nthread);
+    auto batch_max_columns = sparse_page_->Push(batch, missing, nthread);
     inferred_num_columns = std::max(batch_max_columns, inferred_num_columns);
     total_batch_size += batch.Size();
     // Append meta information if available
@ -203,8 +203,8 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
   CHECK(in_stream->Read(&tmagic)) << "invalid input file format";
   CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
   info_.LoadBinary(in_stream);
-  in_stream->Read(&sparse_page_.offset.HostVector());
-  in_stream->Read(&sparse_page_.data.HostVector());
+  in_stream->Read(&sparse_page_->offset.HostVector());
+  in_stream->Read(&sparse_page_->data.HostVector());
 }

 void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
@ -212,8 +212,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
   int tmagic = kMagic;
   fo->Write(tmagic);
   info_.SaveBinary(fo.get());
-  fo->Write(sparse_page_.offset.HostVector());
-  fo->Write(sparse_page_.data.HostVector());
+  fo->Write(sparse_page_->offset.HostVector());
+  fo->Write(sparse_page_->data.HostVector());
 }

 template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,
@ -28,7 +28,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   CHECK(!adapter->Next());

   info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), adapter->DeviceIdx(),
-                                        missing, &sparse_page_);
+                                        missing, sparse_page_.get());
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
   // Synchronise worker columns
@ -1,5 +1,5 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2021 by Contributors
  * \file simple_dmatrix.h
  * \brief In-memory version of DMatrix.
  * \author Tianqi Chen
@ -47,11 +47,12 @@ class SimpleDMatrix : public DMatrix {
   BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override;

   MetaInfo info_;
-  SparsePage sparse_page_;  // Primary storage type
-  std::unique_ptr<CSCPage> column_page_;
-  std::unique_ptr<SortedCSCPage> sorted_column_page_;
-  std::unique_ptr<EllpackPage> ellpack_page_;
-  std::unique_ptr<GHistIndexMatrix> gradient_index_;
+  // Primary storage type
+  std::shared_ptr<SparsePage> sparse_page_ = std::make_shared<SparsePage>();
+  std::shared_ptr<CSCPage> column_page_;
+  std::shared_ptr<SortedCSCPage> sorted_column_page_;
+  std::shared_ptr<EllpackPage> ellpack_page_;
+  std::shared_ptr<GHistIndexMatrix> gradient_index_;
   BatchParam batch_param_;

   bool EllpackExists() const override {
@ -1,59 +1,147 @@
 /*!
- * Copyright 2014-2020 by Contributors
+ * Copyright 2014-2021 by Contributors
  * \file sparse_page_dmatrix.cc
  * \brief The external memory version of Page Iterator.
  * \author Tianqi Chen
  */
-#include <dmlc/base.h>
-#include <dmlc/timer.h>
-
-#if DMLC_ENABLE_STD_THREAD
 #include "./sparse_page_dmatrix.h"
-
 #include "./simple_batch_iterator.h"
+#include "gradient_index.h"

 namespace xgboost {
 namespace data {

-MetaInfo& SparsePageDMatrix::Info() {
-  return row_source_->info;
+MetaInfo &SparsePageDMatrix::Info() { return info_; }
+
+const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
+
+SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy_handle,
+                                     DataIterResetCallback *reset,
+                                     XGDMatrixCallbackNext *next, float missing,
+                                     int32_t nthreads, std::string cache_prefix)
+    : proxy_{proxy_handle}, iter_{iter_handle}, reset_{reset}, next_{next}, missing_{missing},
+      nthreads_{nthreads}, cache_prefix_{std::move(cache_prefix)} {
+  cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
+  if (rabit::IsDistributed()) {
+    cache_prefix_ += ("-r" + std::to_string(rabit::GetRank()));
+  }
+  DMatrixProxy *proxy = MakeProxy(proxy_);
+  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
+      iter_, reset_, next_};
+
+  uint32_t n_batches = 0;
+  size_t n_features = 0;
+  size_t n_samples = 0;
+  size_t nnz = 0;
+
+  auto num_rows = [&]() {
+    return HostAdapterDispatch(
+        proxy, [](auto const &value) { return value.NumRows(); });
+  };
+  auto num_cols = [&]() {
+    return HostAdapterDispatch(
+        proxy, [](auto const &value) { return value.NumCols(); });
+  };
+  // the proxy is iterated together with the sparse page source so we can obtain all
+  // information in 1 pass.
+  for (auto const &page : this->GetRowBatchesImpl()) {
+    this->info_.Extend(std::move(proxy->Info()), false, false);
+    n_features = std::max(n_features, num_cols());
+    n_samples += num_rows();
+    nnz += page.data.Size();
+    n_batches++;
+  }
+
+  iter.Reset();
+
+  this->n_batches_ = n_batches;
+  this->info_.num_row_ = n_samples;
+  this->info_.num_col_ = n_features;
+  this->info_.num_nonzero_ = nnz;
+
+  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  CHECK_NE(info_.num_col_, 0);
 }

-const MetaInfo& SparsePageDMatrix::Info() const {
-  return row_source_->info;
+void SparsePageDMatrix::InitializeSparsePage() {
+  auto id = MakeCache(this, ".row.page", cache_prefix_, &cache_info_);
+  // Don't use proxy DMatrix once this is already initialized, this allows users to
+  // release the iterator and data.
+  if (cache_info_.at(id)->written) {
+    CHECK(sparse_page_source_);
+    sparse_page_source_->Reset();
+    return;
+  }
+
+  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
+      iter_, reset_, next_};
+  DMatrixProxy *proxy = MakeProxy(proxy_);
+  sparse_page_source_.reset();  // clear before creating new one to prevent conflicts.
+  sparse_page_source_ = std::make_shared<SparsePageSource>(
+      iter, proxy, this->missing_, this->nthreads_, this->info_.num_col_,
+      this->n_batches_, cache_info_.at(id));
+}
+
+BatchSet<SparsePage> SparsePageDMatrix::GetRowBatchesImpl() {
+  this->InitializeSparsePage();
+  auto begin_iter = BatchIterator<SparsePage>(sparse_page_source_);
+  return BatchSet<SparsePage>(BatchIterator<SparsePage>(begin_iter));
 }

 BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
-  return row_source_->GetBatchSet();
+  return this->GetRowBatchesImpl();
 }

 BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() {
-  // Lazily instantiate
+  auto id = MakeCache(this, ".col.page", cache_prefix_, &cache_info_);
+  CHECK_NE(this->Info().num_col_, 0);
+  this->InitializeSparsePage();
   if (!column_source_) {
-    column_source_.reset(new CSCPageSource(this, cache_info_));
+    column_source_ = std::make_shared<CSCPageSource>(
+        this->missing_, this->nthreads_, this->Info().num_col_,
+        this->n_batches_, cache_info_.at(id), sparse_page_source_);
+  } else {
+    column_source_->Reset();
   }
-  return column_source_->GetBatchSet();
+  auto begin_iter = BatchIterator<CSCPage>(column_source_);
+  return BatchSet<CSCPage>(BatchIterator<CSCPage>(begin_iter));
 }

 BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() {
-  // Lazily instantiate
+  auto id = MakeCache(this, ".sorted.col.page", cache_prefix_, &cache_info_);
+  CHECK_NE(this->Info().num_col_, 0);
+  this->InitializeSparsePage();
   if (!sorted_column_source_) {
-    sorted_column_source_.reset(new SortedCSCPageSource(this, cache_info_));
+    sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
+        this->missing_, this->nthreads_, this->Info().num_col_,
+        this->n_batches_, cache_info_.at(id), sparse_page_source_);
+  } else {
+    sorted_column_source_->Reset();
   }
-  return sorted_column_source_->GetBatchSet();
+  auto begin_iter = BatchIterator<SortedCSCPage>(sorted_column_source_);
+  return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(begin_iter));
 }

-BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
-  CHECK_GE(param.gpu_id, 0);
+BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam& param) {
   CHECK_GE(param.max_bin, 2);
-  // Lazily instantiate
-  if (!ellpack_source_ || (batch_param_ != param && param != BatchParam{})) {
-    ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
+  // External memory is not support
+  if (!ghist_index_source_ || (param != batch_param_ && param != BatchParam{})) {
+    this->InitializeSparsePage();
+    ghist_index_source_.reset(new GHistIndexMatrix{this, param.max_bin});
     batch_param_ = param;
   }
-  return ellpack_source_->GetBatchSet();
+  this->InitializeSparsePage();
+  auto begin_iter = BatchIterator<GHistIndexMatrix>(
+      new SimpleBatchIteratorImpl<GHistIndexMatrix>(ghist_index_source_));
+  return BatchSet<GHistIndexMatrix>(begin_iter);
 }
+
+#if !defined(XGBOOST_USE_CUDA)
+BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
+  common::AssertGPUSupport();
+  auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
+  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace data
 }  // namespace xgboost
-#endif  // DMLC_ENABLE_STD_THREAD
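The rewritten source file above does all metadata discovery in a single pass over the user iterator and then serves every later pass from the binary cache. The hypothetical driver loop below, not part of the commit, shows the intended call pattern: the first `GetBatches<SparsePage>()` loop pulls data through the callbacks and writes the cache, while a second loop over the same DMatrix is served from the cache with asynchronous pre-fetch and never touches the proxy again.

    // Sketch of caller-side usage under the above assumptions.
    #include <cstddef>
    #include "xgboost/data.h"

    double SumRowLengths(xgboost::DMatrix* dmat) {
      double total = 0;
      for (auto const& page : dmat->GetBatches<xgboost::SparsePage>()) {
        auto view = page.GetView();            // host view of the CSR batch
        for (std::size_t i = 0; i < view.Size(); ++i) {
          total += view[i].size();             // number of entries in row i
        }
      }
      // Re-running this loop re-reads the cached pages; it does not re-invoke the
      // user-supplied Next/Reset callbacks.
      return total;
    }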
src/data/sparse_page_dmatrix.cu  (new file, 46 lines)
@ -0,0 +1,46 @@
/*!
 * Copyright 2021 XGBoost contributors
 */
#include "sparse_page_source.h"
#include "../common/hist_util.cuh"
#include "ellpack_page.cuh"
#include "sparse_page_dmatrix.h"

namespace xgboost {
namespace data {
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
  CHECK_GE(param.gpu_id, 0);
  CHECK_GE(param.max_bin, 2);
  auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
  size_t row_stride = 0;
  this->InitializeSparsePage();
  if (!cache_info_.at(id)->written || (batch_param_ != param && param != BatchParam{})) {
    // reinitialize the cache
    cache_info_.erase(id);
    MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
    std::unique_ptr<common::HistogramCuts> cuts;
    cuts.reset(new common::HistogramCuts{
        common::DeviceSketch(param.gpu_id, this, param.max_bin, 0)});
    this->InitializeSparsePage();  // reset after use.

    row_stride = GetRowStride(this);
    this->InitializeSparsePage();  // reset after use.
    CHECK_NE(row_stride, 0);
    batch_param_ = param;

    auto ft = this->info_.feature_types.ConstDeviceSpan();
    ellpack_page_source_.reset();  // release resources.
    ellpack_page_source_.reset(new EllpackPageSource(
        this->missing_, this->nthreads_, this->Info().num_col_,
        this->n_batches_, cache_info_.at(id), param, std::move(cuts),
        this->IsDense(), row_stride, ft, sparse_page_source_));
  } else {
    CHECK(sparse_page_source_);
    ellpack_page_source_->Reset();
  }

  auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
}
}  // namespace data
}  // namespace xgboost
@ -1,5 +1,5 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2021 by Contributors
  * \file sparse_page_dmatrix.h
  * \brief External-memory version of DMatrix.
  * \author Tianqi Chen
@ -13,24 +13,88 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <map>

 #include "ellpack_page_source.h"
 #include "sparse_page_source.h"

 namespace xgboost {
 namespace data {
-// Used for external memory.
+/**
+ * \brief DMatrix used for external memory.
+ *
+ * The external memory is created for controlling memory usage by splitting up data into
+ * multiple batches. However that doesn't mean we will actually process exact 1 batch at
+ * a time, which would be terribly slow considering that we have to loop through the
+ * whole dataset for every tree split. So we use async pre-fetch and let caller to decide
+ * how many batches it wants to process by returning data as shared pointer. The caller
+ * can use async function to process the data or just stage those batches, making the
+ * decision is out of the scope for sparse page dmatrix. These 2 optimizations might
+ * defeat the purpose of splitting up dataset since if you load all the batches then the
+ * memory usage is even worse than using a single batch. Essentially we need to control
+ * how many batches can be in memory at the same time.
+ *
+ * Right now the write to the cache is sequential operation and is blocking, reading from
+ * cache is async but with a hard coded limit of 4 pages as an heuristic. So by sparse
+ * dmatrix itself there can be only 9 pages in main memory (might be of different types)
+ * at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
+ * dependent pages. If the caller stops iteration at the middle and start again, then the
+ * number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
+ * caller's code (XGBoost doesn't discard a large portion of data at the end, there's not
+ * sampling algo that samples only the first portion of data).
+ *
+ * Of course if the caller decides to retain some batches to perform parallel processing,
+ * then we might load all pages in memory, which is also considered as a bug in caller's
+ * code. So if the algo supports external memory, it must be careful that queue for async
+ * call must have an upper limit.
+ *
+ * Another assumption we make is that the data must be immutable so caller should never
+ * change the data. Sparse page source returns const page to make sure of that. If you
+ * want to change the generated page like Ellpack, pass parameter into `GetBatches` to
+ * re-generate them instead of trying to modify the pages in-place.
+ *
+ * A possible optimization is dropping the sparse page once dependent pages like ellpack
+ * are constructed and cached.
+ */
 class SparsePageDMatrix : public DMatrix {
+  MetaInfo info_;
+  BatchParam batch_param_;
+  std::map<std::string, std::shared_ptr<Cache>> cache_info_;
+
+  DMatrixHandle proxy_;
+  DataIterHandle iter_;
+  DataIterResetCallback *reset_;
+  XGDMatrixCallbackNext *next_;
+
+  float missing_;
+  int nthreads_;
+  std::string cache_prefix_;
+  uint32_t n_batches_ {0};
+  // sparse page is the source to other page types, we make a special member function.
+  void InitializeSparsePage();
+  // Non-virtual version that can be used in constructor
+  BatchSet<SparsePage> GetRowBatchesImpl();

  public:
-  template <typename AdapterT>
-  explicit SparsePageDMatrix(AdapterT* adapter, float missing, int nthread,
-                             const std::string& cache_prefix,
-                             size_t page_size = kPageSize)
-      : cache_info_(std::move(cache_prefix)) {
-    row_source_.reset(new data::SparsePageSource(adapter, missing, nthread,
-                                                 cache_prefix, page_size));
+  explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy,
+                             DataIterResetCallback *reset,
+                             XGDMatrixCallbackNext *next, float missing,
+                             int32_t nthreads, std::string cache_prefix);
+
+  ~SparsePageDMatrix() override {
+    // Clear out all resources before deleting the cache file.
+    sparse_page_source_.reset();
+    ellpack_page_source_.reset();
+    column_source_.reset();
+    sorted_column_source_.reset();
+    ghist_index_source_.reset();
+
+    for (auto const &kv : cache_info_) {
+      CHECK(kv.second);
+      auto n = kv.second->ShardName();
+      TryDeleteCacheFile(n);
+    }
   }
-  ~SparsePageDMatrix() override = default;

   MetaInfo& Info() override;
@ -47,30 +111,41 @@ class SparsePageDMatrix : public DMatrix {
   BatchSet<CSCPage> GetColumnBatches() override;
   BatchSet<SortedCSCPage> GetSortedColumnBatches() override;
   BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<GHistIndexMatrix>(BatchIterator<GHistIndexMatrix>(nullptr));
-  }
+  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override;

   // source data pointers.
-  std::unique_ptr<SparsePageSource> row_source_;
-  std::unique_ptr<CSCPageSource> column_source_;
-  std::unique_ptr<SortedCSCPageSource> sorted_column_source_;
-  std::unique_ptr<EllpackPageSource> ellpack_source_;
-  // saved batch param
-  BatchParam batch_param_;
-  // the cache prefix
-  std::string cache_info_;
-  // Store column densities to avoid recalculating
-  std::vector<float> col_density_;
+  std::shared_ptr<SparsePageSource> sparse_page_source_;
+  std::shared_ptr<EllpackPageSource> ellpack_page_source_;
+  std::shared_ptr<CSCPageSource> column_source_;
+  std::shared_ptr<SortedCSCPageSource> sorted_column_source_;
+  std::shared_ptr<GHistIndexMatrix> ghist_index_source_;

   bool EllpackExists() const override {
-    return static_cast<bool>(ellpack_source_);
+    return static_cast<bool>(ellpack_page_source_);
   }
   bool SparsePageExists() const override {
-    return static_cast<bool>(row_source_);
+    return static_cast<bool>(sparse_page_source_);
   }
 };
+
+inline std::string MakeId(std::string prefix, SparsePageDMatrix *ptr) {
+  std::stringstream ss;
+  ss << ptr;
+  return prefix + "-" + ss.str();
+}
+
+inline std::string
+MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
+          std::map<std::string, std::shared_ptr<Cache>> *out) {
+  auto &cache_info = *out;
+  auto name = MakeId(prefix, ptr);
+  auto id = name + format;
+  auto it = cache_info.find(id);
+  if (it == cache_info.cend()) {
+    cache_info[id].reset(new Cache{false, name, format});
+  }
+  return id;
+}
 }  // namespace data
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
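The cache bookkeeping above keys every page type by a per-DMatrix id built from the cache prefix and the object's address, and each entry points to a `Cache` record (declared in sparse_page_source.h, shown further below) that tracks per-batch byte offsets into one shard file. The standalone sketch below, which is not part of the commit, illustrates just that offset arithmetic: sizes are appended while writing and `Commit()` turns them into absolute seek positions.

    // Standalone illustration of the Cache offset bookkeeping (assumed page sizes).
    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<std::size_t> offset{0};            // Cache::offset starts as {0}
      for (std::size_t bytes : {128u, 96u, 256u}) {  // three pages written to the shard
        offset.push_back(bytes);                     // WriteCache() appends the byte count
      }
      // Cache::Commit(): a prefix sum converts sizes into seek positions for ReadCache().
      std::partial_sum(offset.begin(), offset.end(), offset.begin());
      for (auto o : offset) { std::cout << o << ' '; }  // prints: 0 128 224 480
      std::cout << '\n';
      return 0;
    }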
@ -1,77 +0,0 @@
/*!
 * Copyright (c) 2020 by XGBoost Contributors
 */
#include "sparse_page_source.h"

namespace xgboost {
namespace data {
void DataPool::Slice(std::shared_ptr<SparsePage> out, size_t offset,
                     size_t n_rows, size_t entry_offset) const {
  auto const &in_offset = pool_.offset.HostVector();
  auto const &in_data = pool_.data.HostVector();
  auto &h_offset = out->offset.HostVector();
  CHECK_LE(offset + n_rows + 1, in_offset.size());
  h_offset.resize(n_rows + 1, 0);
  std::transform(in_offset.cbegin() + offset,
                 in_offset.cbegin() + offset + n_rows + 1, h_offset.begin(),
                 [=](size_t ptr) { return ptr - entry_offset; });

  auto &h_data = out->data.HostVector();
  CHECK_GT(h_offset.size(), 0);
  size_t n_entries = h_offset.back();
  h_data.resize(n_entries);

  CHECK_EQ(n_entries, in_offset.at(offset + n_rows) - in_offset.at(offset));
  std::copy_n(in_data.cbegin() + in_offset.at(offset), n_entries,
              h_data.begin());
}

void DataPool::SplitWritePage() {
  size_t total = pool_.Size();
  size_t offset = 0;
  size_t entry_offset = 0;
  do {
    size_t n_rows = std::min(page_size_, total - offset);
    std::shared_ptr<SparsePage> out;
    writer_->Alloc(&out);
    out->Clear();
    out->SetBaseRowId(inferred_num_rows_);
    this->Slice(out, offset, n_rows, entry_offset);
    inferred_num_rows_ += out->Size();
    offset += n_rows;
    entry_offset += out->data.Size();
    CHECK_NE(out->Size(), 0);
    writer_->PushWrite(std::move(out));
  } while (total - offset >= page_size_);

  if (total - offset != 0) {
    auto out = std::make_shared<SparsePage>();
    this->Slice(out, offset, total - offset, entry_offset);
    CHECK_NE(out->Size(), 0);
    pool_.Clear();
    pool_.Push(*out);
  } else {
    pool_.Clear();
  }
}
size_t DataPool::Finalize() {
  inferred_num_rows_ += pool_.Size();
  if (pool_.Size() != 0) {
    std::shared_ptr<SparsePage> page;
    this->writer_->Alloc(&page);
    page->Clear();
    page->Push(pool_);
    this->writer_->PushWrite(std::move(page));
  }

  if (inferred_num_rows_ == 0) {
    std::shared_ptr<SparsePage> page;
    this->writer_->Alloc(&page);
    page->Clear();
    this->writer_->PushWrite(std::move(page));
  }

  return inferred_num_rows_;
}
}  // namespace data
}  // namespace xgboost
src/data/sparse_page_source.cu  (new file, 17 lines)
@ -0,0 +1,17 @@
/*!
 * Copyright 2021 XGBoost contributors
 */
#include "sparse_page_source.h"
#include "proxy_dmatrix.cuh"
#include "simple_dmatrix.cuh"

namespace xgboost {
namespace data {
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
  auto device = proxy->DeviceIdx();
  Dispatch(proxy, [&](auto const &value) {
    CopyToSparsePage(value, device, missing, page);
  });
}
}  // namespace data
}  // namespace xgboost
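`DevicePush` above is the device-side fallback for `SparsePageSource::Fetch()` further below: the source first tries to consume the staged batch as host data, and only when the dispatcher reports a type error does it copy from GPU memory. The standalone sketch below, which is not XGBoost code and uses invented stand-in types, shows only that control flow.

    // Standalone sketch of the "host first, device fallback" dispatch shape.
    #include <iostream>

    enum class Staged { kHostCsr, kCudaArray };

    // Stand-in for HostAdapterDispatch: reports a type error for non-host data.
    bool HostDispatch(Staged staged, bool* type_error) {
      *type_error = (staged != Staged::kHostCsr);
      return !*type_error;  // true means the batch was pushed on the host
    }

    void DevicePushSketch() { std::cout << "copy batch from device memory\n"; }

    void FetchSketch(Staged staged) {
      bool type_error = false;
      if (!HostDispatch(staged, &type_error) && type_error) {
        DevicePushSketch();  // mirrors DevicePush(proxy, missing, page)
      }
    }

    int main() {
      FetchSketch(Staged::kHostCsr);    // host path, nothing printed
      FetchSketch(Staged::kCudaArray);  // device path
      return 0;
    }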
@ -1,54 +1,18 @@
 /*!
- * Copyright (c) 2014-2019 by Contributors
- * \file page_csr_source.h
- * External memory data source, saved with sparse_batch_page binary format.
- * \author Tianqi Chen
- *
- * -------------------------------------------------
- * Random notes on implementation of external memory
- * -------------------------------------------------
- *
- * As of XGBoost 1.3, the general pipeline is:
- *
- * dmlc text file parser --> file adapter --> sparse page source -> data pool -->
- * write to binary cache --> load it back ~~> [ other pages (csc, ellpack, sorted csc) -->
- * write to binary cache ] --> use it in various algorithms.
- *
- * ~~> means optional
- *
- * The dmlc text file parser returns number of blocks based on available threads, which
- * can make the data partitioning non-deterministic, so here we set up an extra data pool
- * to stage parsed data. As a result, the number of blocks returned by text parser does
- * not equal to number of blocks in binary cache.
- *
- * Binary cache loading is async by the dmlc threaded iterator, which helps performance,
- * but as this iterator itself is not thread safe, so calling
- * `dmatrix->GetBatches<SparsePage>` is also not thread safe. Please note that, the
- * threaded iterator is also used inside dmlc text file parser.
- *
- * Memory consumption is difficult to control due to various reasons. Firstly the text
- * parsing doesn't have a batch size, only a hard coded buffer size is available.
- * Secondly, everything is loaded/written with async queue, with multiple queues running
- * the memory consumption is difficult to measure.
- *
- * The threaded iterator relies heavily on C++ memory model and threading primitive. The
- * concurrent writer for binary cache is an old copy of moody queue. We should try to
- * replace them with something more robust.
+ * Copyright (c) 2014-2021 by Contributors
+ * \file sparse_page_source.h
 */
 #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
 #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_

-#include <dmlc/threadediter.h>
-#include <dmlc/timer.h>
-
-#include <algorithm>
-#include <limits>
-#include <locale>
-#include <memory>
+#include <algorithm>  // std::min
 #include <string>
 #include <utility>
 #include <vector>
-#include <fstream>
+#include <future>
+#include <thread>
+#include <map>
+#include <memory>

 #include "rabit/rabit.h"
 #include "xgboost/base.h"
@ -56,93 +20,12 @@

 #include "adapter.h"
 #include "sparse_page_writer.h"
+#include "proxy_dmatrix.h"

 #include "../common/common.h"
-#include <xgboost/data.h>
-
-namespace detail {
-
-// Split a cache info string with delimiter ':'
-// If cache info string contains drive letter (e.g. C:), exclude it before splitting
-inline std::vector<std::string>
-GetCacheShards(const std::string& cache_info) {
-#if (defined _WIN32) || (defined __CYGWIN__)
-  if (cache_info.length() >= 2
-      && std::isalpha(cache_info[0], std::locale::classic())
-      && cache_info[1] == ':') {
-    std::vector<std::string> cache_shards
-        = xgboost::common::Split(cache_info.substr(2), ':');
-    cache_shards[0] = cache_info.substr(0, 2) + cache_shards[0];
-    return cache_shards;
-  }
-#endif  // (defined _WIN32) || (defined __CYGWIN__)
-  return xgboost::common::Split(cache_info, ':');
-}
-
-}  // namespace detail
-
 namespace xgboost {
 namespace data {
-
-template<typename S, typename T>
-class SparseBatchIteratorImpl : public BatchIteratorImpl<T> {
- public:
-  explicit SparseBatchIteratorImpl(S* source) : source_(source) {
-    CHECK(source_ != nullptr);
-    source_->BeforeFirst();
-    source_->Next();
-  }
-  T& operator*() override { return source_->Value(); }
-  const T& operator*() const override { return source_->Value(); }
-  void operator++() override { at_end_ = !source_->Next(); }
-  bool AtEnd() const override { return at_end_; }
-
- private:
-  S* source_{nullptr};
-  bool at_end_{ false };
-};
-
-/*! \brief magic number used to identify Page */
-static const int kMagic = 0xffffab02;
-/*!
- * \brief decide the format from cache prefix.
- * \return pair of row format, column format type of the cache prefix.
- */
-inline std::pair<std::string, std::string> DecideFormat(const std::string& cache_prefix) {
-  size_t pos = cache_prefix.rfind(".fmt-");
-
-  if (pos != std::string::npos) {
-    std::string fmt = cache_prefix.substr(pos + 5, cache_prefix.length());
-    size_t cpos = fmt.rfind('-');
-    if (cpos != std::string::npos) {
-      return std::make_pair(fmt.substr(0, cpos), fmt.substr(cpos + 1, fmt.length()));
-    } else {
-      return std::make_pair(fmt, fmt);
-    }
-  } else {
-    std::string raw = "raw";
-    return std::make_pair(raw, raw);
-  }
-}
-
-struct CacheInfo {
-  std::string name_info;
-  std::vector<std::string> format_shards;
-  std::vector<std::string> name_shards;
-};
-
-inline CacheInfo ParseCacheInfo(const std::string& cache_info, const std::string& page_type) {
-  CacheInfo info;
-  std::vector<std::string> cache_shards = ::detail::GetCacheShards(cache_info);
-  CHECK_NE(cache_shards.size(), 0U);
-  // read in the info files.
-  info.name_info = cache_shards[0];
-  for (const std::string& prefix : cache_shards) {
-    info.name_shards.push_back(prefix + page_type);
-    info.format_shards.push_back(DecideFormat(prefix).first);
-  }
-  return info;
-}
-
 inline void TryDeleteCacheFile(const std::string& file) {
   if (std::remove(file.c_str()) != 0) {
     LOG(WARNING) << "Couldn't remove external memory cache file " << file
@ -150,415 +33,327 @@ inline void TryDeleteCacheFile(const std::string& file) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void CheckCacheFileExists(const std::string& file) {
|
struct Cache {
|
||||||
std::ifstream f(file.c_str());
|
// whether the write to the cache is complete
|
||||||
if (f.good()) {
|
bool written;
|
||||||
LOG(FATAL)
|
std::string name;
|
||||||
<< "Cache file " << file << " exists already; "
|
|
||||||
<< "Is there another DMatrix with the same "
|
|
||||||
"cache prefix? It can be caused by previously used DMatrix that "
|
|
||||||
"hasn't been collected by language environment garbage collector. "
|
|
||||||
"Otherwise please remove it manually.";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* \brief Given a set of cache files and page type, this object iterates over batches
|
|
||||||
* using prefetching for improved performance. Not thread safe.
|
|
||||||
*
|
|
||||||
* \tparam PageT Type of the page t.
|
|
||||||
*/
|
|
||||||
template <typename PageT>
|
|
||||||
class ExternalMemoryPrefetcher : dmlc::DataIter<PageT> {
|
|
||||||
public:
|
|
||||||
explicit ExternalMemoryPrefetcher(const CacheInfo& info) noexcept(false)
|
|
||||||
: base_rowid_(0), page_(nullptr), clock_ptr_(0) {
|
|
||||||
// read in the info files
|
|
||||||
CHECK_NE(info.name_shards.size(), 0U);
|
|
||||||
{
|
|
||||||
std::unique_ptr<dmlc::Stream> finfo(
|
|
||||||
dmlc::Stream::Create(info.name_info.c_str(), "r"));
|
|
||||||
int tmagic;
|
|
||||||
CHECK(finfo->Read(&tmagic));
|
|
||||||
CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
|
|
||||||
}
|
|
||||||
files_.resize(info.name_shards.size());
|
|
||||||
formats_.resize(info.name_shards.size());
|
|
||||||
prefetchers_.resize(info.name_shards.size());
|
|
||||||
|
|
||||||
// read in the cache files.
|
|
||||||
for (size_t i = 0; i < info.name_shards.size(); ++i) {
|
|
||||||
std::string name_row = info.name_shards.at(i);
|
|
||||||
files_[i].reset(dmlc::SeekStream::CreateForRead(name_row.c_str()));
|
|
||||||
std::unique_ptr<dmlc::SeekStream>& fi = files_[i];
|
|
||||||
std::string format;
|
std::string format;
|
||||||
CHECK(fi->Read(&format)) << "Invalid page format";
|
// offset into binary cache file.
|
||||||
formats_[i].reset(CreatePageFormat<PageT>(format));
|
std::vector<size_t> offset;
|
||||||
std::unique_ptr<SparsePageFormat<PageT>>& fmt = formats_[i];
|
|
||||||
size_t fbegin = fi->Tell();
|
Cache(bool w, std::string n, std::string fmt)
|
||||||
prefetchers_[i].reset(new dmlc::ThreadedIter<PageT>(4));
|
: written{w}, name{std::move(n)}, format{std::move(fmt)} {
|
||||||
prefetchers_[i]->Init(
|
offset.push_back(0);
|
||||||
[&fi, &fmt](PageT** dptr) {
|
|
||||||
if (*dptr == nullptr) {
|
|
||||||
*dptr = new PageT();
|
|
||||||
}
|
|
||||||
return fmt->Read(*dptr, fi.get());
|
|
||||||
},
|
|
||||||
[&fi, fbegin]() { fi->Seek(fbegin); });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/*! \brief destructor */
|
|
||||||
~ExternalMemoryPrefetcher() override {
|
|
||||||
delete page_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// implement Next
|
static std::string ShardName(std::string name, std::string format) {
|
||||||
bool Next() override {
|
CHECK_EQ(format.front(), '.');
|
||||||
CHECK(mutex_.try_lock()) << "Multiple threads attempting to use prefetcher";
|
return name + format;
|
||||||
// doing clock rotation over shards.
|
|
||||||
if (page_ != nullptr) {
|
|
||||||
size_t n = prefetchers_.size();
|
|
||||||
prefetchers_[(clock_ptr_ + n - 1) % n]->Recycle(&page_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prefetchers_[clock_ptr_]->Next(&page_)) {
|
std::string ShardName() {
|
||||||
page_->SetBaseRowId(base_rowid_);
|
return ShardName(this->name, this->format);
|
||||||
base_rowid_ += page_->Size();
|
}
|
||||||
// advance clock
|
|
||||||
clock_ptr_ = (clock_ptr_ + 1) % prefetchers_.size();
|
// The write is completed.
|
||||||
mutex_.unlock();
|
void Commit() {
|
||||||
return true;
|
if (!written) {
|
||||||
} else {
|
std::partial_sum(offset.begin(), offset.end(), offset.begin());
|
||||||
mutex_.unlock();
|
written = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Prevents multi-threaded call.
|
||||||
|
class TryLockGuard {
|
||||||
|
std::mutex& lock_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit TryLockGuard(std::mutex& lock) : lock_{lock} { // NOLINT
|
||||||
|
CHECK(lock_.try_lock()) << "Multiple threads attempting to use Sparse DMatrix.";
|
||||||
|
}
|
||||||
|
~TryLockGuard() {
|
||||||
|
lock_.unlock();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename S>
|
||||||
|
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||||
|
protected:
|
||||||
|
// Prevents calling this iterator from multiple places(or threads).
|
||||||
|
std::mutex single_threaded_;
|
||||||
|
|
||||||
|
std::shared_ptr<S> page_;
|
||||||
|
|
||||||
|
bool at_end_ {false};
|
||||||
|
float missing_;
|
||||||
|
int nthreads_;
|
||||||
|
bst_feature_t n_features_;
|
||||||
|
|
||||||
|
uint32_t count_{0};
|
||||||
|
|
||||||
|
uint32_t n_batches_ {0};
|
||||||
|
|
||||||
|
std::shared_ptr<Cache> cache_info_;
|
||||||
|
std::unique_ptr<dmlc::Stream> fo_;
|
||||||
|
|
||||||
|
using Ring = std::vector<std::future<std::shared_ptr<S>>>;
|
||||||
|
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
|
||||||
|
// can pre-fetch data in a ring.
|
||||||
|
std::unique_ptr<Ring> ring_{new Ring};
|
||||||
|
|
||||||
|
bool ReadCache() {
|
||||||
|
CHECK(!at_end_);
|
||||||
|
if (!cache_info_->written) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (fo_) {
|
||||||
|
fo_.reset(); // flush the data to disk.
|
||||||
|
ring_->resize(n_batches_);
|
||||||
|
}
|
||||||
|
// An heuristic for number of pre-fetched batches. We can make it part of BatchParam
|
||||||
|
// to let user adjust number of pre-fetched batches when needed.
|
||||||
|
uint32_t constexpr kPreFetch = 4;
|
||||||
|
|
||||||
|
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
|
||||||
|
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
|
||||||
|
size_t fetch_it = count_;
|
||||||
|
for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||||
|
fetch_it %= n_batches_; // ring
|
||||||
|
if (ring_->at(fetch_it).valid()) { continue; }
|
||||||
|
auto const *self = this; // make sure it's const
|
||||||
|
CHECK_LT(fetch_it, cache_info_->offset.size());
|
||||||
|
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
|
||||||
|
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||||
|
auto n = self->cache_info_->ShardName();
|
||||||
|
size_t offset = self->cache_info_->offset.at(fetch_it);
|
||||||
|
std::unique_ptr<dmlc::SeekStream> fi{
|
||||||
|
dmlc::SeekStream::CreateForRead(n.c_str())};
|
||||||
|
fi->Seek(offset);
|
||||||
|
CHECK_EQ(fi->Tell(), offset);
|
||||||
|
auto page = std::make_shared<S>();
|
||||||
|
CHECK(fmt->Read(page.get(), fi.get()));
|
||||||
|
return page;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(),
|
||||||
|
[](auto const &f) { return f.valid(); }),
|
||||||
|
n_prefetch_batches)
|
||||||
|
<< "Sparse DMatrix assumes forward iteration.";
|
||||||
|
page_ = (*ring_)[count_].get();
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// implement BeforeFirst
|
void WriteCache() {
|
||||||
void BeforeFirst() override {
|
CHECK(!cache_info_->written);
|
||||||
CHECK(mutex_.try_lock()) << "Multiple threads attempting to use prefetcher";
|
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||||
base_rowid_ = 0;
|
if (!fo_) {
|
||||||
clock_ptr_ = 0;
|
auto n = cache_info_->ShardName();
|
||||||
for (auto& p : prefetchers_) {
|
fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
|
||||||
p->BeforeFirst();
|
|
||||||
}
|
}
|
||||||
mutex_.unlock();
|
auto bytes = fmt->Write(*page_, fo_.get());
|
||||||
|
cache_info_->offset.push_back(bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// implement Value
|
virtual void Fetch() = 0;
|
||||||
PageT& Value() { return *page_; }
|
|
||||||
|
|
||||||
const PageT& Value() const override { return *page_; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::mutex mutex_;
|
|
||||||
/*! \brief number of rows */
|
|
||||||
size_t base_rowid_;
|
|
||||||
/*! \brief page currently on hold. */
|
|
||||||
PageT* page_;
|
|
||||||
/*! \brief internal clock ptr */
|
|
||||||
size_t clock_ptr_;
|
|
||||||
/*! \brief file pointer to the row blob file. */
|
|
||||||
std::vector<std::unique_ptr<dmlc::SeekStream>> files_;
|
|
||||||
/*! \brief Sparse page format file. */
|
|
||||||
std::vector<std::unique_ptr<SparsePageFormat<PageT>>> formats_;
|
|
||||||
/*! \brief internal prefetcher. */
|
|
||||||
std::vector<std::unique_ptr<dmlc::ThreadedIter<PageT>>> prefetchers_;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// A data pool to keep the size of each page balanced and data partitioning to be
|
|
||||||
// deterministic.
|
|
||||||
class DataPool {
|
|
||||||
size_t inferred_num_rows_;
|
|
||||||
MetaInfo* info_;
|
|
||||||
SparsePage pool_;
|
|
||||||
size_t page_size_;
|
|
||||||
SparsePageWriter<SparsePage> *writer_;
|
|
||||||
|
|
||||||
void Slice(std::shared_ptr<SparsePage> out, size_t offset, size_t n_rows,
|
|
||||||
size_t entry_offset) const;
|
|
||||||
void SplitWritePage();
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
DataPool(MetaInfo *info, size_t page_size,
|
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
|
||||||
SparsePageWriter<SparsePage> *writer)
|
uint32_t n_batches, std::shared_ptr<Cache> cache)
|
||||||
: inferred_num_rows_{0}, info_{info},
|
: missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
|
||||||
page_size_{page_size}, writer_{writer} {}
|
n_batches_{n_batches}, cache_info_{std::move(cache)} {}
|
||||||
|
|
||||||
void Push(std::shared_ptr<SparsePage> page) {
|
SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
|
||||||
info_->num_nonzero_ += page->data.Size();
|
|
||||||
pool_.Push(*page);
|
~SparsePageSourceImpl() override {
|
||||||
if (pool_.Size() > page_size_) {
|
for (auto& fu : *ring_) {
|
||||||
this->SplitWritePage();
|
if (fu.valid()) {
|
||||||
|
fu.get();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
page->Clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Finalize();
|
uint32_t Iter() const { return count_; }
|
||||||
|
|
||||||
|
const S &operator*() const override {
|
||||||
|
CHECK(page_);
|
||||||
|
return *page_;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<S const> Page() const override {
|
||||||
|
return page_;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AtEnd() const override {
|
||||||
|
return at_end_;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void Reset() {
|
||||||
|
TryLockGuard guard{single_threaded_};
|
||||||
|
at_end_ = false;
|
||||||
|
count_ = 0;
|
||||||
|
this->Fetch();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
-class SparsePageSource {
+#if defined(XGBOOST_USE_CUDA)
+void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
+#else
+inline void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
+  common::AssertGPUSupport();
+}
+#endif
+
+class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
+  DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
+  DMatrixProxy* proxy_;
+  size_t base_row_id_ {0};
+
+  void Fetch() final {
+    page_ = std::make_shared<SparsePage>();
+    if (!this->ReadCache()) {
+      bool type_error { false };
+      CHECK(proxy_);
+      HostAdapterDispatch(proxy_, [&](auto const &adapter_batch) {
+        page_->Push(adapter_batch, this->missing_, this->nthreads_);
+      }, &type_error);
+      if (type_error) {
+        DevicePush(proxy_, missing_, page_.get());
+      }
+      page_->SetBaseRowId(base_row_id_);
+      base_row_id_ += page_->Size();
+      n_batches_++;
+      this->WriteCache();
+    }
+  }
  public:
+  SparsePageSource(
+      DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter,
+      DMatrixProxy *proxy, float missing, int nthreads,
+      bst_feature_t n_features, uint32_t n_batches, std::shared_ptr<Cache> cache)
+      : SparsePageSourceImpl(missing, nthreads, n_features, n_batches, cache),
+        iter_{iter}, proxy_{proxy} {
+    if (!cache_info_->written) {
+      iter_.Reset();
+      iter_.Next();
+    }
+    this->Fetch();
+  }
+
+  SparsePageSource& operator++() final {
+    TryLockGuard guard{single_threaded_};
+    count_++;
+    if (cache_info_->written) {
+      at_end_ = (count_ == n_batches_);
+    } else {
+      at_end_ = !iter_.Next();
+    }
+
+    if (at_end_) {
+      cache_info_->Commit();
+      if (n_batches_ != 0) {
+        CHECK_EQ(count_, n_batches_);
+      }
+      CHECK_GE(count_, 1);
+      proxy_ = nullptr;
+    } else {
+      this->Fetch();
+    }
+    return *this;
+  }
+
+  void Reset() override {
+    if (proxy_) {
+      TryLockGuard guard{single_threaded_};
+      iter_.Reset();
+    }
+    SparsePageSourceImpl::Reset();
+
+    TryLockGuard guard{single_threaded_};
+    base_row_id_ = 0;
+  }
-  template <typename AdapterT>
-  SparsePageSource(AdapterT* adapter, float missing, int nthread,
-                   const std::string& cache_info,
-                   const size_t page_size = DMatrix::kPageSize) {
-    const std::string page_type = ".row.page";
-    cache_info_ = ParseCacheInfo(cache_info, page_type);
-    // Warn user if old cache files
-    CheckCacheFileExists(cache_info_.name_info);
-    for (auto file : cache_info_.name_shards) {
-      CheckCacheFileExists(file);
-    }
-
-    {
-      SparsePageWriter<SparsePage> writer(cache_info_.name_shards,
-                                          cache_info_.format_shards, 6);
-      DataPool pool(&info, page_size, &writer);
-      std::shared_ptr<SparsePage> page { new SparsePage };
-
-      uint64_t inferred_num_columns = 0;
-      uint64_t inferred_num_rows = 0;
-
-      const uint64_t default_max = std::numeric_limits<uint64_t>::max();
-      uint64_t last_group_id = default_max;
-      bst_uint group_size = 0;
-      std::vector<uint64_t> qids;
-      adapter->BeforeFirst();
-      while (adapter->Next()) {
-        auto& batch = adapter->Value();
-        if (batch.Labels() != nullptr) {
-          auto& labels = info.labels_.HostVector();
-          labels.insert(labels.end(), batch.Labels(),
-                        batch.Labels() + batch.Size());
-        }
-        if (batch.Weights() != nullptr) {
-          auto& weights = info.weights_.HostVector();
-          weights.insert(weights.end(), batch.Weights(),
-                         batch.Weights() + batch.Size());
-        }
-        if (batch.BaseMargin() != nullptr) {
-          auto& base_margin = info.base_margin_.HostVector();
-          base_margin.insert(base_margin.end(), batch.BaseMargin(),
-                             batch.BaseMargin() + batch.Size());
-        }
-        if (batch.Qid() != nullptr) {
-          qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
-          // get group
-          for (size_t i = 0; i < batch.Size(); ++i) {
-            const uint64_t cur_group_id = batch.Qid()[i];
-            if (last_group_id == default_max ||
-                last_group_id != cur_group_id) {
-              info.group_ptr_.push_back(group_size);
-            }
-            last_group_id = cur_group_id;
-            ++group_size;
-          }
-        }
-        CHECK_EQ(page->Size(), 0);
-        auto batch_max_columns = page->Push(batch, missing, nthread);
-        inferred_num_columns =
-            std::max(batch_max_columns, inferred_num_columns);
-        inferred_num_rows += page->Size();
-        pool.Push(page);
-        page->SetBaseRowId(inferred_num_rows);
-      }
-
-      if (last_group_id != default_max) {
-        if (group_size > info.group_ptr_.back()) {
-          info.group_ptr_.push_back(group_size);
-        }
-      }
-
-      // Deal with empty rows/columns if necessary
-      if (adapter->NumColumns() == kAdapterUnknownSize) {
-        info.num_col_ = inferred_num_columns;
-      } else {
-        info.num_col_ = adapter->NumColumns();
-      }
-      // Synchronise worker columns
-      rabit::Allreduce<rabit::op::Max>(&info.num_col_, 1);
-
-      if (adapter->NumRows() == kAdapterUnknownSize) {
-        info.num_row_ = inferred_num_rows;
-      } else {
-        if (page->offset.HostVector().empty()) {
-          page->offset.HostVector().emplace_back(0);
-        }
-
-        while (inferred_num_rows < adapter->NumRows()) {
-          page->offset.HostVector().emplace_back(
-              page->offset.HostVector().back());
-          inferred_num_rows++;
-        }
-        info.num_row_ = adapter->NumRows();
-      }
-
-      pool.Push(page);
-      pool.Finalize();
-
-      std::unique_ptr<dmlc::Stream> fo(
-          dmlc::Stream::Create(cache_info_.name_info.c_str(), "w"));
-      int tmagic = kMagic;
-      fo->Write(tmagic);
-      // Either every row has query ID or none at all
-      CHECK(qids.empty() || qids.size() == info.num_row_);
-      info.SaveBinary(fo.get());
-    }
-    LOG(INFO) << "SparsePageSource Finished writing to "
-              << cache_info_.name_info;
-
-    external_prefetcher_.reset(
-        new ExternalMemoryPrefetcher<SparsePage>(cache_info_));
-  }
-
-  ~SparsePageSource() {
-    external_prefetcher_.reset();
-    TryDeleteCacheFile(cache_info_.name_info);
-    for (auto file : cache_info_.name_shards) {
-      TryDeleteCacheFile(file);
-    }
-  }
-
-  BatchSet<SparsePage> GetBatchSet() {
-    auto begin_iter = BatchIterator<SparsePage>(
-        new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<SparsePage>,
-                                    SparsePage>(external_prefetcher_.get()));
-    return BatchSet<SparsePage>(begin_iter);
-  }
-
-  MetaInfo info;
-
- private:
-  std::unique_ptr<ExternalMemoryPrefetcher<SparsePage>> external_prefetcher_;
-  CacheInfo cache_info_;
 };
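The class above replaces the old parser-driven constructor with a pull model: the source repeatedly invokes the user-supplied reset/next callbacks and materialises one page per call, caching it for later passes. The following standalone sketch illustrates only that callback contract; `Batch` and `BatchIterator` are invented names for this example and are not part of the XGBoost API.

#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for one user-provided batch of CSR data.
struct Batch {
  std::vector<std::size_t> offset;
  std::vector<float> data;
};

// A minimal callback-style iterator: Reset() rewinds, Next() advances and
// returns false once the data is exhausted, Value() exposes the current batch.
class BatchIterator {
 public:
  explicit BatchIterator(std::vector<Batch> batches) : batches_{std::move(batches)} {}
  void Reset() { current_ = -1; }
  bool Next() { return ++current_ < static_cast<int>(batches_.size()); }
  Batch const& Value() const { return batches_[static_cast<std::size_t>(current_)]; }

 private:
  std::vector<Batch> batches_;
  int current_{-1};
};

int main() {
  BatchIterator iter{{Batch{{0, 2}, {1.f, 2.f}}, Batch{{0, 1}, {3.f}}}};
  // First pass: the consumer pulls batches one by one, never holding more than
  // one user batch at a time; a real source would also write each page to its cache.
  iter.Reset();
  std::size_t n_batches = 0;
  while (iter.Next()) {
    std::cout << "batch " << n_batches++ << " has " << iter.Value().data.size()
              << " entries\n";
  }
  // Later passes would read the cached pages instead of calling Next() again.
  return 0;
}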
-class CSCPageSource {
+// A mixin for advancing the iterator.
+template <typename S>
+class PageSourceIncMixIn : public SparsePageSourceImpl<S> {
+ protected:
+  std::shared_ptr<SparsePageSource> source_;
+
  public:
-  CSCPageSource(DMatrix* src, const std::string& cache_info,
-                const size_t page_size = DMatrix::kPageSize) {
-    std::string page_type = ".col.page";
-    cache_info_ = ParseCacheInfo(cache_info, page_type);
-    for (auto file : cache_info_.name_shards) {
-      CheckCacheFileExists(file);
-    }
-    {
-      SparsePageWriter<SparsePage> writer(cache_info_.name_shards,
-                                          cache_info_.format_shards, 6);
-      std::shared_ptr<SparsePage> page;
-      writer.Alloc(&page);
-      page->Clear();
-
-      size_t bytes_write = 0;
-      double tstart = dmlc::GetTime();
-      for (auto& batch : src->GetBatches<SparsePage>()) {
-        page->PushCSC(batch.GetTranspose(src->Info().num_col_));
-
-        if (page->MemCostBytes() >= page_size) {
-          bytes_write += page->MemCostBytes();
-          writer.PushWrite(std::move(page));
-          writer.Alloc(&page);
-          page->Clear();
-          double tdiff = dmlc::GetTime() - tstart;
-          LOG(INFO) << "Writing to " << cache_info << " in "
-                    << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
-                    << (bytes_write >> 20UL) << " written";
-        }
-      }
-      if (page->data.Size() != 0) {
-        writer.PushWrite(std::move(page));
-      }
+  using SparsePageSourceImpl<S>::SparsePageSourceImpl;
+
+  PageSourceIncMixIn& operator++() final {
+    TryLockGuard guard{this->single_threaded_};
+    ++(*source_);
+
+    ++this->count_;
+    this->at_end_ = source_->AtEnd();
+
+    if (this->at_end_) {
+      this->cache_info_->Commit();
+      if (this->n_batches_ != 0) {
+        CHECK_EQ(this->count_, this->n_batches_);
+      }
+      CHECK_GE(this->count_, 1);
+    } else {
+      this->Fetch();
+    }
+    CHECK_EQ(source_->Iter(), this->count_);
+    return *this;
+  }
LOG(INFO) << "CSCPageSource: Finished writing to "
|
|
||||||
<< cache_info_.name_info;
|
|
||||||
}
|
|
||||||
external_prefetcher_.reset(
|
|
||||||
new ExternalMemoryPrefetcher<CSCPage>(cache_info_));
|
|
||||||
}
|
|
||||||
|
|
||||||
~CSCPageSource() {
|
|
||||||
external_prefetcher_.reset();
|
|
||||||
for (auto file : cache_info_.name_shards) {
|
|
||||||
TryDeleteCacheFile(file);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
BatchSet<CSCPage> GetBatchSet() {
|
|
||||||
auto begin_iter = BatchIterator<CSCPage>(
|
|
||||||
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<CSCPage>, CSCPage>(
|
|
||||||
external_prefetcher_.get()));
|
|
||||||
return BatchSet<CSCPage>(begin_iter);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::unique_ptr<ExternalMemoryPrefetcher<CSCPage>> external_prefetcher_;
|
|
||||||
CacheInfo cache_info_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class SortedCSCPageSource {
|
class CSCPageSource : public PageSourceIncMixIn<CSCPage> {
|
||||||
|
protected:
|
||||||
|
void Fetch() final {
|
||||||
|
if (!this->ReadCache()) {
|
||||||
|
auto const &csr = source_->Page();
|
||||||
|
this->page_.reset(new CSCPage{});
|
||||||
|
// we might be able to optimize this by merging transpose and pushcsc
|
||||||
|
this->page_->PushCSC(csr->GetTranspose(n_features_));
|
||||||
|
page_->SetBaseRowId(csr->base_rowid);
|
||||||
|
this->WriteCache();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SortedCSCPageSource(DMatrix* src, const std::string& cache_info,
|
CSCPageSource(
|
||||||
const size_t page_size = DMatrix::kPageSize) {
|
float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
|
||||||
std::string page_type = ".sorted.col.page";
|
std::shared_ptr<Cache> cache,
|
||||||
cache_info_ = ParseCacheInfo(cache_info, page_type);
|
std::shared_ptr<SparsePageSource> source)
|
||||||
for (auto file : cache_info_.name_shards) {
|
: PageSourceIncMixIn(missing, nthreads, n_features,
|
||||||
CheckCacheFileExists(file);
|
n_batches, cache) {
|
||||||
|
this->source_ = source;
|
||||||
|
this->Fetch();
|
||||||
}
|
}
|
||||||
-  {
-    SparsePageWriter<SparsePage> writer(cache_info_.name_shards,
-                                        cache_info_.format_shards, 6);
-    std::shared_ptr<SparsePage> page;
-    writer.Alloc(&page);
-    page->Clear();
-
-    size_t bytes_write = 0;
-    double tstart = dmlc::GetTime();
-    for (auto& batch : src->GetBatches<SparsePage>()) {
-      SparsePage tmp = batch.GetTranspose(src->Info().num_col_);
-      page->PushCSC(tmp);
-      page->SortRows();
-
-      if (page->MemCostBytes() >= page_size) {
-        bytes_write += page->MemCostBytes();
-        writer.PushWrite(std::move(page));
-        writer.Alloc(&page);
-        page->Clear();
-        double tdiff = dmlc::GetTime() - tstart;
-        LOG(INFO) << "Writing to " << cache_info << " in "
-                  << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
-                  << (bytes_write >> 20UL) << " written";
-      }
-    }
-    if (page->data.Size() != 0) {
-      writer.PushWrite(std::move(page));
-    }
-    LOG(INFO) << "SortedCSCPageSource: Finished writing to "
-              << cache_info_.name_info;
-  }
-  external_prefetcher_.reset(
-      new ExternalMemoryPrefetcher<SortedCSCPage>(cache_info_));
-  }
-  ~SortedCSCPageSource() {
-    external_prefetcher_.reset();
-    for (auto file : cache_info_.name_shards) {
-      TryDeleteCacheFile(file);
-    }
-  }
-
-  BatchSet<SortedCSCPage> GetBatchSet() {
-    auto begin_iter = BatchIterator<SortedCSCPage>(
-        new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<SortedCSCPage>,
-                                    SortedCSCPage>(external_prefetcher_.get()));
-    return BatchSet<SortedCSCPage>(begin_iter);
-  }
-
- private:
-  std::unique_ptr<ExternalMemoryPrefetcher<SortedCSCPage>> external_prefetcher_;
-  CacheInfo cache_info_;
 };

+class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
+ protected:
+  void Fetch() final {
+    if (!this->ReadCache()) {
+      auto const &csr = this->source_->Page();
+      this->page_.reset(new SortedCSCPage{});
+      // we might be able to optimize this by merging transpose and pushcsc
+      this->page_->PushCSC(csr->GetTranspose(n_features_));
+      CHECK_EQ(this->page_->Size(), n_features_);
+      CHECK_EQ(this->page_->data.Size(), csr->data.Size());
+      this->page_->SortRows();
+      page_->SetBaseRowId(csr->base_rowid);
+      this->WriteCache();
+    }
+  }
+
+ public:
+  SortedCSCPageSource(float missing, int nthreads, bst_feature_t n_features,
+                      uint32_t n_batches, std::shared_ptr<Cache> cache,
+                      std::shared_ptr<SparsePageSource> source)
+      : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache) {
+    this->source_ = source;
+    this->Fetch();
+  }
+};
 } // namespace data
 } // namespace xgboost
 #endif  // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
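The column sources above no longer rebuild the whole matrix; each one derives its page on demand from whatever row page the upstream SparsePageSource currently holds, then caches the result. A rough, self-contained illustration of that chaining follows; the types are toys invented for the example, not the real classes.

#include <cstddef>
#include <iostream>
#include <vector>

// Toy "pages": a row page and a column page derived from it.
struct RowPage { std::vector<std::vector<float>> rows; };
struct ColPage { std::vector<std::vector<float>> cols; };

// Upstream source yields row pages one at a time.
struct RowSource {
  std::vector<RowPage> pages;
  std::size_t current{0};
  bool AtEnd() const { return current >= pages.size(); }
  RowPage const& Page() const { return pages[current]; }
  void operator++() { ++current; }
};

// Derived source: transposes only the *current* upstream page, mirroring how a
// column page is produced from a row page (a real source would cache it).
struct ColSource {
  RowSource* upstream;
  ColPage Fetch(std::size_t n_features) const {
    ColPage out;
    out.cols.resize(n_features);
    for (auto const& row : upstream->Page().rows) {
      for (std::size_t f = 0; f < row.size(); ++f) out.cols[f].push_back(row[f]);
    }
    return out;
  }
};

int main() {
  RowSource rows{{RowPage{{{1, 2}, {3, 4}}}, RowPage{{{5, 6}}}}};
  ColSource cols{&rows};
  while (!rows.AtEnd()) {
    auto page = cols.Fetch(2);  // derived page for the current batch only
    std::cout << "column 0 has " << page.cols[0].size() << " values\n";
    ++rows;  // advancing the row source drives the whole pipeline
  }
  return 0;
}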
@ -63,103 +63,6 @@ inline SparsePageFormat<T>* CreatePageFormat(const std::string& name) {
   return (e->body)();
 }

-#if DMLC_ENABLE_STD_THREAD
-/*!
- * \brief A threaded writer to write sparse batch page to sharded files.
- * @tparam T Type of the page.
- */
-template<typename T>
-class SparsePageWriter {
- public:
-  /*!
-   * \brief constructor
-   * \param name_shards name of shard files.
-   * \param format_shards format of each shard.
-   * \param extra_buffer_capacity Extra buffer capacity before block.
-   */
-  explicit SparsePageWriter(const std::vector<std::string>& name_shards,
-                            const std::vector<std::string>& format_shards,
-                            size_t extra_buffer_capacity)
-      : num_free_buffer_(extra_buffer_capacity + name_shards.size()),
-        clock_ptr_(0),
-        workers_(name_shards.size()),
-        qworkers_(name_shards.size()) {
-    CHECK_EQ(name_shards.size(), format_shards.size());
-    // start writer threads
-    for (size_t i = 0; i < name_shards.size(); ++i) {
-      std::string name_shard = name_shards[i];
-      std::string format_shard = format_shards[i];
-      auto* wqueue = &qworkers_[i];
-      workers_[i].reset(new std::thread(
-          [this, name_shard, format_shard, wqueue]() {
-            std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(name_shard.c_str(), "w"));
-            std::unique_ptr<SparsePageFormat<T>> fmt(CreatePageFormat<T>(format_shard));
-            fo->Write(format_shard);
-            std::shared_ptr<T> page;
-            while (wqueue->Pop(&page)) {
-              if (page == nullptr) break;
-              fmt->Write(*page, fo.get());
-              qrecycle_.Push(std::move(page));
-            }
-            fo.reset(nullptr);
-            LOG(INFO) << "SparsePageWriter Finished writing to " << name_shard;
-          }));
-    }
-  }
-
-  /*! \brief destructor, will close the files automatically */
-  ~SparsePageWriter() {
-    for (auto& queue : qworkers_) {
-      // use nullptr to signal termination.
-      std::shared_ptr<T> sig(nullptr);
-      queue.Push(std::move(sig));
-    }
-    for (auto& thread : workers_) {
-      thread->join();
-    }
-  }
-
-  /*!
-   * \brief Push a write job to the writer.
-   * This function won't block,
-   * writing is done by another thread inside writer.
-   * \param page The page to be written
-   */
-  void PushWrite(std::shared_ptr<T>&& page) {
-    qworkers_[clock_ptr_].Push(std::move(page));
-    clock_ptr_ = (clock_ptr_ + 1) % workers_.size();
-  }
-
-  /*!
-   * \brief Allocate a page to store results.
-   * This function can block when the writer is too slow and buffer pages
-   * have not yet been recycled.
-   * \param out_page Used to store the allocated pages.
-   */
-  void Alloc(std::shared_ptr<T>* out_page) {
-    CHECK(*out_page == nullptr);
-    if (num_free_buffer_ != 0) {
-      out_page->reset(new T());
-      --num_free_buffer_;
-    } else {
-      CHECK(qrecycle_.Pop(out_page));
-    }
-  }
-
- private:
-  /*! \brief number of allocated pages */
-  size_t num_free_buffer_;
-  /*! \brief clock_pointer */
-  size_t clock_ptr_;
-  /*! \brief writer threads */
-  std::vector<std::unique_ptr<std::thread>> workers_;
-  /*! \brief recycler queue */
-  dmlc::ConcurrentBlockingQueue<std::shared_ptr<T>> qrecycle_;
-  /*! \brief worker threads */
-  std::vector<dmlc::ConcurrentBlockingQueue<std::shared_ptr<T>>> qworkers_;
-};
-#endif  // DMLC_ENABLE_STD_THREAD
-
 /*!
  * \brief Registry entry for sparse page format.
  */
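The threaded SparsePageWriter removed above kept dedicated worker threads and blocking queues. The destructor shown earlier in this diff drains a ring of std::future objects instead, which suggests the following pattern for bounded asynchronous page writes. This is an editor's minimal sketch of that idea, not the code used by the commit.

#include <cstddef>
#include <future>
#include <iostream>
#include <vector>

// Pretend "write a page to disk": here it just reports the page id.
void WritePage(int page_id) { std::cout << "wrote page " << page_id << "\n"; }

int main() {
  // A fixed-size ring of futures: before reusing a slot we wait on the write
  // that previously occupied it, so at most kRingSize writes are in flight
  // and the number of pages held in memory stays bounded.
  constexpr std::size_t kRingSize = 2;
  std::vector<std::future<void>> ring(kRingSize);

  for (int page_id = 0; page_id < 6; ++page_id) {
    auto& slot = ring[page_id % kRingSize];
    if (slot.valid()) {
      slot.get();  // block until the earlier write in this slot has finished
    }
    slot = std::async(std::launch::async, WritePage, page_id);
  }
  // Drain outstanding writes before exiting, as a destructor would.
  for (auto& fu : ring) {
    if (fu.valid()) fu.get();
  }
  return 0;
}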
@ -131,7 +131,7 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
     if (rnd_(i) <= p) {
       return gpair / p;
     } else {
-      return GradientPair();
+      return {};
     }
   }
 }
@ -143,13 +143,13 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
   CombineGradientPair combine_;
 };

-NoSampling::NoSampling(EllpackPageImpl* page) : page_(page) {}
+NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}

 GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
   return {dmat->Info().num_row_, page_, gpair};
 }

-ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
                                                    size_t n_rows,
                                                    const BatchParam& batch_param)
     : batch_param_(batch_param),
@ -171,7 +171,7 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
   return {dmat->Info().num_row_, page_.get(), gpair};
 }

-UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample)
+UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
     : page_(page), subsample_(subsample) {}

 GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
@ -183,7 +183,7 @@ GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DM
   return {dmat->Info().num_row_, page_, gpair};
 }

-ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page,
+ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page,
                                                              size_t n_rows,
                                                              const BatchParam& batch_param,
                                                              float subsample)
@ -231,7 +231,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampling::GradientBasedSampling(EllpackPageImpl* page,
+GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
                                              size_t n_rows,
                                              const BatchParam&,
                                              float subsample)
@ -257,7 +257,7 @@ GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpa
 }

 ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
-    EllpackPageImpl* page,
+    EllpackPageImpl const* page,
     size_t n_rows,
     const BatchParam& batch_param,
     float subsample)
@ -313,7 +313,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
+GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
                                            size_t n_rows,
                                            const BatchParam& batch_param,
                                            float subsample,
@ -16,7 +16,7 @@ struct GradientBasedSample {
   /*!\brief Number of sampled rows. */
   size_t sample_rows;
   /*!\brief Sampled rows in ELLPACK format. */
-  EllpackPageImpl* page;
+  EllpackPageImpl const* page;
   /*!\brief Gradient pairs for the sampled rows. */
   common::Span<GradientPair> gpair;
 };
@ -31,17 +31,17 @@ class SamplingStrategy {
 /*! \brief No sampling in in-memory mode. */
 class NoSampling : public SamplingStrategy {
  public:
-  explicit NoSampling(EllpackPageImpl* page);
+  explicit NoSampling(EllpackPageImpl const* page);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* page_;
+  EllpackPageImpl const* page_;
 };

 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryNoSampling : public SamplingStrategy {
  public:
-  ExternalMemoryNoSampling(EllpackPageImpl* page,
+  ExternalMemoryNoSampling(EllpackPageImpl const* page,
                            size_t n_rows,
                            const BatchParam& batch_param);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
@ -55,25 +55,25 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
 /*! \brief Uniform sampling in in-memory mode. */
 class UniformSampling : public SamplingStrategy {
  public:
-  UniformSampling(EllpackPageImpl* page, float subsample);
+  UniformSampling(EllpackPageImpl const* page, float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* page_;
+  EllpackPageImpl const* page_;
   float subsample_;
 };

 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryUniformSampling : public SamplingStrategy {
  public:
-  ExternalMemoryUniformSampling(EllpackPageImpl* page,
+  ExternalMemoryUniformSampling(EllpackPageImpl const* page,
                                 size_t n_rows,
                                 const BatchParam& batch_param,
                                 float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* original_page_;
+  EllpackPageImpl const* original_page_;
   BatchParam batch_param_;
   float subsample_;
   std::unique_ptr<EllpackPageImpl> page_;
@ -84,14 +84,14 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
  public:
-  GradientBasedSampling(EllpackPageImpl* page,
+  GradientBasedSampling(EllpackPageImpl const* page,
                         size_t n_rows,
                         const BatchParam& batch_param,
                         float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* page_;
+  EllpackPageImpl const* page_;
   float subsample_;
   dh::caching_device_vector<float> threshold_;
   dh::caching_device_vector<float> grad_sum_;
@ -100,14 +100,14 @@ class GradientBasedSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in external memory mode.. */
 class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  public:
-  ExternalMemoryGradientBasedSampling(EllpackPageImpl* page,
+  ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page,
                                       size_t n_rows,
                                       const BatchParam& batch_param,
                                       float subsample);
   GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;

  private:
-  EllpackPageImpl* original_page_;
+  EllpackPageImpl const* original_page_;
   BatchParam batch_param_;
   float subsample_;
   dh::caching_device_vector<float> threshold_;
@ -128,7 +128,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  */
 class GradientBasedSampler {
  public:
-  GradientBasedSampler(EllpackPageImpl* page,
+  GradientBasedSampler(EllpackPageImpl const* page,
                        size_t n_rows,
                        const BatchParam& batch_param,
                        float subsample,
@ -163,7 +163,7 @@ class DeviceHistogram {
 template <typename GradientSumT>
 struct GPUHistMakerDevice {
   int device_id;
-  EllpackPageImpl* page;
+  EllpackPageImpl const* page;
   common::Span<FeatureType const> feature_types;
   BatchParam batch_param;

@ -199,7 +199,7 @@ struct GPUHistMakerDevice {
   dh::caching_device_vector<uint32_t> node_categories;

   GPUHistMakerDevice(int _device_id,
-                     EllpackPageImpl* _page,
+                     EllpackPageImpl const* _page,
                      common::Span<FeatureType const> _feature_types,
                      bst_uint _n_rows,
                      TrainParam _param,
@ -488,7 +488,7 @@ struct GPUHistMakerDevice {
     }
   }

-  void FinalisePositionInPage(EllpackPageImpl *page,
+  void FinalisePositionInPage(EllpackPageImpl const *page,
                               const common::Span<RegTree::Node> d_nodes,
                               common::Span<FeatureType const> d_feature_types,
                               common::Span<uint32_t const> categories,
@ -812,7 +812,6 @@ class GPUHistMakerSpecialised {
     BatchParam batch_param{
       device_,
       param_.max_bin,
-      generic_param_->gpu_page_size
     };
     auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
     dh::safe_cuda(cudaSetDevice(device_));
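Several of the hunks above only change raw `EllpackPageImpl*` parameters and members to `EllpackPageImpl const*`. The point of such a change is that read-only consumers can accept pages handed out by a read-only cache without casts, as in this small standalone illustration (the `Page` type is a toy, not the real class):

#include <iostream>
#include <vector>

// Toy stand-in for a compressed page; not the real EllpackPageImpl.
struct Page {
  std::vector<int> bins;
};

// Consumers that only read the page can take `Page const*`, so a page handed
// out by a read-only batch iterator can be passed straight through.
int SumBins(Page const* page) {
  int total = 0;
  for (int b : page->bins) total += b;
  return total;
}

int main() {
  Page const page{{1, 2, 3}};           // a read-only page, as a cache would hand out
  std::cout << SumBins(&page) << "\n";  // prints 6; no const_cast needed
  return 0;
}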
@ -125,12 +125,10 @@ TEST(DenseColumnWithMissing, Test) {
 }

 void TestGHistIndexMatrixCreation(size_t nthreads) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
   /* This should create multiple sparse pages */
-  std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries, kPageSize, filename) };
+  std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries) };
   omp_set_num_threads(nthreads);
   GHistIndexMatrix gmat(dmat.get(), 256);
 }
@ -83,7 +83,7 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
   }
   fo.close();
   return std::shared_ptr<DMatrix>(DMatrix::Load(
-      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
+      tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
 }

 // Test that elements are approximately equally distributed among bins
@ -59,12 +59,9 @@ TEST(SparsePage, PushCSC) {
 }

 TEST(SparsePage, PushCSCAfterTranspose) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<DMatrix> dmat =
-      CreateSparsePageDMatrix(kEntries, 64UL, filename);
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
   const int ncols = dmat->Info().num_col_;
   SparsePage page; // Consolidated sparse page
   for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
@ -76,12 +73,12 @@ TEST(SparsePage, PushCSCAfterTranspose) {
   // Make sure that the final sparse page has the right number of entries
   ASSERT_EQ(kEntries, page.data.Size());

-  // The feature value for a feature in each row should be identical, as that is
-  // how the dmatrix has been created
-  for (size_t i = 0; i < page.Size(); ++i) {
-    auto inst = page.GetView()[i];
-    for (size_t j = 1; j < inst.size(); ++j) {
-      ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
+  page.SortRows();
+  auto v = page.GetView();
+  for (size_t i = 0; i < v.Size(); ++i) {
+    auto column = v[i];
+    for (size_t j = 1; j < column.size(); ++j) {
+      ASSERT_GE(column[j].fvalue, column[j-1].fvalue);
     }
   }
 }
|||||||
@ -142,7 +142,7 @@ TEST(EllpackPage, Copy) {
|
|||||||
dmlc::TemporaryDirectory tmpdir;
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
std::unique_ptr<DMatrix>
|
std::unique_ptr<DMatrix>
|
||||||
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||||
BatchParam param{0, 256, kPageSize};
|
BatchParam param{0, 256};
|
||||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
|
|
||||||
// Create an empty result page.
|
// Create an empty result page.
|
||||||
@ -188,7 +188,7 @@ TEST(EllpackPage, Compact) {
|
|||||||
dmlc::TemporaryDirectory tmpdir;
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
std::unique_ptr<DMatrix>
|
std::unique_ptr<DMatrix>
|
||||||
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||||
BatchParam param{0, 256, kPageSize};
|
BatchParam param{0, 256};
|
||||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
|
|
||||||
// Create an empty result page.
|
// Create an empty result page.
|
||||||
@ -212,7 +212,7 @@ TEST(EllpackPage, Compact) {
|
|||||||
std::vector<bst_float> row_result(kCols);
|
std::vector<bst_float> row_result(kCols);
|
||||||
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
EXPECT_EQ(impl->base_rowid, current_row);
|
ASSERT_EQ(impl->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl->Size(); i++) {
|
for (size_t i = 0; i < impl->Size(); i++) {
|
||||||
size_t compacted_row = row_indexes_h[current_row];
|
size_t compacted_row = row_indexes_h[current_row];
|
||||||
|
|||||||
 46  tests/cpp/data/test_file_iterator.cc  Normal file
@ -0,0 +1,46 @@
+/*!
+ * Copyright 2021 XGBoost contributors
+ */
+#include <gtest/gtest.h>
+#include <dmlc/filesystem.h>
+
+#include <memory>
+
+#include "../../../src/data/file_iterator.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../../../src/data/adapter.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace data {
+TEST(FileIterator, Basic) {
+  auto check_n_features = [](FileIterator *iter) {
+    size_t n_features = 0;
+    iter->Reset();
+    while (iter->Next()) {
+      auto proxy = MakeProxy(iter->Proxy());
+      auto csr = dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
+      n_features = std::max(n_features, csr->NumColumns());
+    }
+    ASSERT_EQ(n_features, 5);
+  };
+
+  dmlc::TemporaryDirectory tmpdir;
+  {
+    auto zpath = tmpdir.path + "/0-based.svm";
+    CreateBigTestData(zpath, 3 * 64, true);
+    zpath += "?indexing_mode=0";
+    FileIterator iter{zpath, 0, 1, "libsvm"};
+    check_n_features(&iter);
+  }
+
+  {
+    auto opath = tmpdir.path + "/1-based.svm";
+    CreateBigTestData(opath, 3 * 64, false);
+    opath += "?indexing_mode=1";
+    FileIterator iter{opath, 0, 1, "libsvm"};
+    check_n_features(&iter);
+  }
+}
+} // namespace data
+} // namespace xgboost
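The new test drives the libsvm reader through URIs such as `0-based.svm?indexing_mode=0` and `1-based.svm?indexing_mode=1`, where the query selects whether column indices in the file start at 0 or 1. A minimal sketch of splitting such a URI, for illustration only (this is not XGBoost's own URI handling):

#include <iostream>
#include <string>

// Split a "path?key=value" style URI, the way the new test builds its inputs.
int main() {
  std::string uri = "/tmp/1-based.svm?indexing_mode=1";
  auto pos = uri.find('?');
  std::string path = uri.substr(0, pos);
  std::string query = (pos == std::string::npos) ? "" : uri.substr(pos + 1);
  std::cout << "file:  " << path << "\n";   // /tmp/1-based.svm
  std::cout << "query: " << query << "\n";  // indexing_mode=1 (1-based column indices)
  return 0;
}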
@ -142,7 +142,7 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
   IterativeDeviceDMatrix m(
       &iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
       0, 256);
-  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256, 0}).begin();
+  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
   auto impl = ellpack.Impl();
   common::CompressedIterator<uint32_t> iterator(
       impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
@ -260,7 +260,7 @@ TEST(MetaInfo, HostExtend) {
   lhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
   rhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());

-  lhs.Extend(rhs, true);
+  lhs.Extend(rhs, true, true);
   ASSERT_EQ(lhs.num_row_, kRows * 2);
   ASSERT_TRUE(lhs.labels_.HostCanRead());
   ASSERT_TRUE(rhs.labels_.HostCanRead());
@ -141,7 +141,7 @@ TEST(MetaInfo, DeviceExtend) {
   lhs.num_row_ = kRows;
   rhs.num_row_ = kRows;

-  lhs.Extend(rhs, true);
+  lhs.Extend(rhs, true, true);
   ASSERT_EQ(lhs.num_row_, kRows * 2);
   ASSERT_FALSE(lhs.labels_.HostCanRead());
@ -6,11 +6,100 @@
 #include <future>
 #include "../../../src/common/io.h"
 #include "../../../src/data/adapter.h"
+#include "../../../src/data/simple_dmatrix.h"
 #include "../../../src/data/sparse_page_dmatrix.h"
+#include "../../../src/data/file_iterator.h"
 #include "../helpers.h"

 using namespace xgboost; // NOLINT

+template <typename Page>
+void TestSparseDMatrixLoadFile() {
+  dmlc::TemporaryDirectory tmpdir;
+  auto opath = tmpdir.path + "/1-based.svm";
+  CreateBigTestData(opath, 3 * 64, false);
+  opath += "?indexing_mode=1";
+  data::FileIterator iter{opath, 0, 1, "libsvm"};
+  data::SparsePageDMatrix m{&iter,
+                            iter.Proxy(),
+                            data::fileiter::Reset,
+                            data::fileiter::Next,
+                            std::numeric_limits<float>::quiet_NaN(),
+                            1,
+                            "cache"};
+  ASSERT_EQ(m.Info().num_col_, 5);
+  ASSERT_EQ(m.Info().num_row_, 64);
+
+  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
+      dmlc::Parser<uint32_t>::Create(opath.c_str(), 0, 1, "auto"));
+  auto adapter = data::FileAdapter{parser.get()};
+
+  data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
+                             1};
+  Page out;
+  for (auto const& page : m.GetBatches<Page>()) {
+    if (std::is_same<Page, SparsePage>::value) {
+      out.Push(page);
+    } else {
+      out.PushCSC(page);
+    }
+  }
+  ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
+  ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
+
+  for (auto const& page : simple.GetBatches<Page>()) {
+    ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
+    for (size_t i = 0; i < page.data.Size(); ++i) {
+      ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
+    }
+  }
+}
+
+TEST(SparsePageDMatrix, LoadFile) {
+  TestSparseDMatrixLoadFile<SparsePage>();
+  TestSparseDMatrixLoadFile<CSCPage>();
+  TestSparseDMatrixLoadFile<SortedCSCPage>();
+}
+
+// allow caller to retain pages so they can process multiple pages at the same time.
+template <typename Page>
+void TestRetainPage() {
+  auto m = CreateSparsePageDMatrix(10000);
+  auto batches = m->GetBatches<Page>();
+  auto begin = batches.begin();
+  auto end = batches.end();
+
+  std::vector<Page> pages;
+  std::vector<std::shared_ptr<Page const>> iterators;
+  for (auto it = begin; it != end; ++it) {
+    iterators.push_back(it.Page());
+    pages.emplace_back(Page{});
+    if (std::is_same<Page, SparsePage>::value) {
+      pages.back().Push(*it);
+    } else {
+      pages.back().PushCSC(*it);
+    }
+    ASSERT_EQ(pages.back().Size(), (*it).Size());
+  }
+  ASSERT_GE(iterators.size(), 2);
+
+  for (size_t i = 0; i < iterators.size(); ++i) {
+    ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size());
+    ASSERT_EQ((*iterators[i]).data.HostVector(), pages.at(i).data.HostVector());
+  }
+
+  // make sure it's const and the caller can not modify the content of page.
+  for (auto& page : m->GetBatches<Page>()) {
+    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
+  }
+}
+
+TEST(SparsePageDMatrix, RetainSparsePage) {
+  TestRetainPage<SparsePage>();
+  TestRetainPage<CSCPage>();
+  TestRetainPage<SortedCSCPage>();
+}
+
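TestRetainPage above relies on the batch iterator exposing each page through a `std::shared_ptr<Page const>`, so a caller can keep several pages alive while iteration continues. A toy, self-contained version of that ownership pattern follows; the `Batch` and `Source` names are invented for this sketch.

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct Batch { std::vector<float> data; };

// A toy batch source that hands out its current page through a shared_ptr
// with const element type. Storing the pointer keeps that page alive even
// after the source has advanced -- which is what "retaining" a page means.
class Source {
 public:
  explicit Source(std::vector<Batch> batches) : batches_{std::move(batches)} {}
  bool Next() {
    if (index_ >= batches_.size()) return false;
    current_ = std::make_shared<Batch const>(batches_[index_++]);
    return true;
  }
  std::shared_ptr<Batch const> Page() const { return current_; }

 private:
  std::vector<Batch> batches_;
  std::size_t index_{0};
  std::shared_ptr<Batch const> current_;
};

int main() {
  Source source{{Batch{{1.f}}, Batch{{2.f, 3.f}}}};
  std::vector<std::shared_ptr<Batch const>> retained;
  while (source.Next()) {
    retained.push_back(source.Page());  // keep every page alive
  }
  std::cout << "retained " << retained.size() << " pages; first has "
            << retained.front()->data.size() << " value(s)\n";
  return 0;
}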
 TEST(SparsePageDMatrix, MetaInfo) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
@ -19,8 +108,6 @@ TEST(SparsePageDMatrix, MetaInfo) {

   xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
       tmp_file + "#" + tmp_file + ".cache", false, false);
-  std::cout << tmp_file << std::endl;
-  EXPECT_TRUE(FileExists(tmp_file + ".cache"));

   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 8ul);
@ -32,10 +119,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
 }

 TEST(SparsePageDMatrix, RowAccess) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
-  std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(24, 4, filename);
+  std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(24);

   // Test the data read into the first row
   auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@ -43,7 +127,7 @@ TEST(SparsePageDMatrix, RowAccess) {
   auto first_row = page[0];
   ASSERT_EQ(first_row.size(), 3ul);
   EXPECT_EQ(first_row[2].index, 2u);
-  EXPECT_EQ(first_row[2].fvalue, 20);
+  EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4);
 }

 TEST(SparsePageDMatrix, ColAccess) {
@ -54,55 +138,46 @@ TEST(SparsePageDMatrix, ColAccess) {
       xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);

   // Loop over the batches and assert the data is as expected
+  size_t iter = 0;
   for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
     auto col_page = col_batch.GetView();
-    EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
-    EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
-    EXPECT_EQ(col_page[1].size(), 1);
+    ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
+    if (iter == 1) {
+      ASSERT_EQ(col_page[0][0].fvalue, 0.f);
+      ASSERT_EQ(col_page[3][0].fvalue, 30.f);
+      ASSERT_EQ(col_page[3][0].index, 1);
+      ASSERT_EQ(col_page[3].size(), 1);
+    } else {
+      ASSERT_EQ(col_page[1][0].fvalue, 10.0f);
+      ASSERT_EQ(col_page[1].size(), 1);
+    }
+    CHECK_LE(col_batch.base_rowid, dmat->Info().num_row_);
+    ++iter;
   }

   // Loop over the batches and assert the data is as expected
+  iter = 0;
   for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
     auto col_page = col_batch.GetView();
     EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
+    if (iter == 0) {
       EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
       EXPECT_EQ(col_page[1].size(), 1);
+    } else {
+      EXPECT_EQ(col_page[3][0].fvalue, 30.f);
+      EXPECT_EQ(col_page[3].size(), 1);
+    }
+    iter++;
   }

-  EXPECT_TRUE(FileExists(tmp_file + ".cache"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
-
   delete dmat;
-
-  EXPECT_FALSE(FileExists(tmp_file + ".cache"));
-  EXPECT_FALSE(FileExists(tmp_file + ".cache.row.page"));
-  EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
-  EXPECT_FALSE(FileExists(tmp_file + ".cache.sorted.col.page"));
-}
-
-TEST(SparsePageDMatrix, ExistingCacheFile) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
-  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
-  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
-  EXPECT_ANY_THROW({
-    std::unique_ptr<xgboost::DMatrix> dmat2 =
-        xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
-  });
 }

 TEST(SparsePageDMatrix, ThreadSafetyException) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/test";
-  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
-  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
+  size_t constexpr kEntriesPerCol = 3;
+  size_t constexpr kEntries = 64 * kEntriesPerCol * 2;

   std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
+      xgboost::CreateSparsePageDMatrix(kEntries);

   int threads = 1000;

@ -134,13 +209,10 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {

 // Multi-batches access
 TEST(SparsePageDMatrix, ColAccessBatches) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
   // Create multiple sparse pages
-  std::unique_ptr<xgboost::DMatrix> dmat{
-      xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename)};
+  std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
   auto n_threads = omp_get_max_threads();
   omp_set_num_threads(16);
   for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
@ -149,234 +221,37 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
   omp_set_num_threads(n_threads);
 }
-TEST(SparsePageDMatrix, Empty) {
+auto TestSparsePageDMatrixDeterminism(int32_t threads) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data{};
-  std::vector<unsigned> feature_idx = {};
-  std::vector<size_t> row_ptr = {};
-
-  {
-    data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(),
-                                 data.data(), 0, 0, 0);
-    data::SparsePageDMatrix dmat(
-        &csr_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-    EXPECT_EQ(dmat.Info().num_nonzero_, 0);
-    EXPECT_EQ(dmat.Info().num_row_, 0);
-    EXPECT_EQ(dmat.Info().num_col_, 0);
-    for (auto &batch : dmat.GetBatches<SparsePage>()) {
-      EXPECT_EQ(batch.Size(), 0);
-    }
-  }
-
-  {
-    data::DenseAdapter dense_adapter(nullptr, 0, 0);
-    data::SparsePageDMatrix dmat2(
-        &dense_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-    EXPECT_EQ(dmat2.Info().num_nonzero_, 0);
-    EXPECT_EQ(dmat2.Info().num_row_, 0);
-    EXPECT_EQ(dmat2.Info().num_col_, 0);
-    for (auto &batch : dmat2.GetBatches<SparsePage>()) {
-      EXPECT_EQ(batch.Size(), 0);
-    }
-  }
-  {
-    data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
-    data::SparsePageDMatrix dmat3(
-        &csc_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-    EXPECT_EQ(dmat3.Info().num_nonzero_, 0);
-    EXPECT_EQ(dmat3.Info().num_row_, 0);
-    EXPECT_EQ(dmat3.Info().num_col_, 0);
-    for (auto &batch : dmat3.GetBatches<SparsePage>()) {
-      EXPECT_EQ(batch.Size(), 0);
-    }
-  }
-}
-
-TEST(SparsePageDMatrix, MissingData) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data{0.0, std::nanf(""), 1.0};
-  std::vector<unsigned> feature_idx = {0, 1, 0};
-  std::vector<size_t> row_ptr = {0, 2, 3};
-
-  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
-                           3, 2);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 2);
-
-  const std::string tmp_file2 = tempdir.path + "/simple2.libsvm";
-  data::SparsePageDMatrix dmat2(&adapter, 1.0, 1, tmp_file2);
-  EXPECT_EQ(dmat2.Info().num_nonzero_, 1);
-}
-
-TEST(SparsePageDMatrix, EmptyRow) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data{0.0, 1.0};
-  std::vector<unsigned> feature_idx = {0, 1};
-  std::vector<size_t> row_ptr = {0, 2, 2};
-
-  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
-                           2, 2);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 2);
-  EXPECT_EQ(dmat.Info().num_row_, 2);
-  EXPECT_EQ(dmat.Info().num_col_, 2);
-}
-
-TEST(SparsePageDMatrix, FromDense) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  int m = 3;
-  int n = 2;
-  std::vector<float> data = {1, 2, 3, 4, 5, 6};
-  data::DenseAdapter adapter(data.data(), m, n);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_col_, 2);
-  EXPECT_EQ(dmat.Info().num_row_, 3);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 6);
-
-  for (auto &batch : dmat.GetBatches<SparsePage>()) {
-    auto page = batch.GetView();
-    for (auto i = 0ull; i < batch.Size(); i++) {
-      auto inst = page[i];
-      for (auto j = 0ull; j < inst.size(); j++) {
-        EXPECT_EQ(inst[j].fvalue, data[i * n + j]);
-        EXPECT_EQ(inst[j].index, j);
-      }
-    }
-  }
-}
-
-TEST(SparsePageDMatrix, FromCSC) {
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::vector<float> data = {1, 3, 2, 4, 5};
-  std::vector<unsigned> row_idx = {0, 1, 0, 1, 2};
-  std::vector<size_t> col_ptr = {0, 2, 5};
-  data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3);
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file);
-  EXPECT_EQ(dmat.Info().num_col_, 2);
-  EXPECT_EQ(dmat.Info().num_row_, 3);
-  EXPECT_EQ(dmat.Info().num_nonzero_, 5);
-
-  auto &batch = *dmat.GetBatches<SparsePage>().begin();
-  auto page = batch.GetView();
-  auto inst = page[0];
-  EXPECT_EQ(inst[0].fvalue, 1);
-  EXPECT_EQ(inst[0].index, 0);
-  EXPECT_EQ(inst[1].fvalue, 2);
-  EXPECT_EQ(inst[1].index, 1);
-
-  inst = page[1];
-  EXPECT_EQ(inst[0].fvalue, 3);
-  EXPECT_EQ(inst[0].index, 0);
-  EXPECT_EQ(inst[1].fvalue, 4);
-  EXPECT_EQ(inst[1].index, 1);
-
-  inst = page[2];
-  EXPECT_EQ(inst[0].fvalue, 5);
-  EXPECT_EQ(inst[0].index, 1);
-}
-
-TEST(SparsePageDMatrix, FromFile) {
-  std::string filename = "test.libsvm";
-  CreateBigTestData(filename, 20);
-  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
-  data::FileAdapter adapter(parser.get());
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-
-  data::SparsePageDMatrix dmat(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1);
-  ASSERT_EQ(dmat.Info().num_col_, 5);
-
-  for (auto &batch : dmat.GetBatches<SparsePage>()) {
-    std::vector<bst_row_t> expected_offset(batch.Size() + 1);
-    auto page = batch.GetView();
-    int n = -3;
-    std::generate(expected_offset.begin(), expected_offset.end(),
-                  [&n] { return n += 3; });
-    EXPECT_EQ(batch.offset.HostVector(), expected_offset);
-
-    if (batch.base_rowid % 2 == 0) {
-      EXPECT_EQ(page[0][0].index, 0);
-      EXPECT_EQ(page[0][1].index, 1);
-      EXPECT_EQ(page[0][2].index, 2);
-    } else {
-      EXPECT_EQ(page[0][0].index, 0);
-      EXPECT_EQ(page[0][1].index, 3);
-      EXPECT_EQ(page[0][2].index, 4);
-    }
-  }
-}
-
-TEST(SparsePageDMatrix, Large) {
-  std::string filename = "test.libsvm";
-  CreateBigTestData(filename, 1 << 16);
-  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
-  data::FileAdapter adapter(parser.get());
-  dmlc::TemporaryDirectory tempdir;
|
|
||||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
|
||||||
|
|
||||||
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
|
|
||||||
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 16)};
|
|
||||||
std::unique_ptr<DMatrix> simple{DMatrix::Load(filename, true, true)};
|
|
||||||
|
|
||||||
std::vector<float> sparse_data;
|
|
||||||
std::vector<size_t> sparse_rptr;
|
|
||||||
std::vector<bst_feature_t> sparse_cids;
|
|
||||||
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
|
|
||||||
|
|
||||||
std::vector<float> simple_data;
|
|
||||||
std::vector<size_t> simple_rptr;
|
|
||||||
std::vector<bst_feature_t> simple_cids;
|
|
||||||
DMatrixToCSR(simple.get(), &simple_data, &simple_rptr, &simple_cids);
|
|
||||||
|
|
||||||
ASSERT_EQ(sparse_rptr.size(), sparse->Info().num_row_ + 1);
|
|
||||||
ASSERT_EQ(sparse_rptr.size(), simple->Info().num_row_ + 1);
|
|
||||||
|
|
||||||
ASSERT_EQ(sparse_data.size(), simple_data.size());
|
|
||||||
ASSERT_EQ(sparse_data, simple_data);
|
|
||||||
ASSERT_EQ(sparse_rptr.size(), simple_rptr.size());
|
|
||||||
ASSERT_EQ(sparse_rptr, simple_rptr);
|
|
||||||
ASSERT_EQ(sparse_cids, simple_cids);
|
|
||||||
}
|
|
||||||
|
|
||||||
-auto TestSparsePageDMatrixDeterminism(int32_t threads, std::string const& filename) {
   omp_set_num_threads(threads);
   std::vector<float> sparse_data;
   std::vector<size_t> sparse_rptr;
   std::vector<bst_feature_t> sparse_cids;
 
-  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
-  data::FileAdapter adapter(parser.get());
   dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1 << 8)};
+  std::string filename = tempdir.path + "/simple.libsvm";
+  CreateBigTestData(filename, 1 << 16);
+  data::FileIterator iter(filename, 0, 1, "auto");
+  std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix{
+      &iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
+      std::numeric_limits<float>::quiet_NaN(), 1, filename}};
 
   DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
 
-  std::string cache_name = tmp_file + ".row.page";
+  auto cache_name =
+      data::MakeId(filename,
+                   dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) +
+      ".row.page";
   std::string cache = common::LoadSequentialFile(cache_name);
   return cache;
 }
 
 TEST(SparsePageDMatrix, Determinism) {
-  std::string filename = "test.libsvm";
-  CreateBigTestData(filename, 1 << 16);
+#if defined(_MSC_VER)
+  return;
+#endif  // defined(_MSC_VER)
   std::vector<std::string> caches;
   for (size_t i = 1; i < 18; i += 2) {
-    caches.emplace_back(TestSparsePageDMatrixDeterminism(i, filename));
+    caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
   }
 
   for (size_t i = 1; i < caches.size(); ++i) {
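Note (reviewer sketch, not part of the diff): the hunk above replaces the dmlc parser adapter with iterator callbacks. A minimal sketch of that construction, assembled only from the calls visible in the test; the internal header paths and the commented parameter names are assumptions, and it is expected to compile only inside the XGBoost source tree:

    // Sketch only -- mirrors the construction used in TestSparsePageDMatrixDeterminism.
    #include <limits>
    #include <memory>
    #include <string>

    #include "../../../src/data/file_iterator.h"        // data::FileIterator (assumed location)
    #include "../../../src/data/sparse_page_dmatrix.h"  // data::SparsePageDMatrix

    namespace xgboost {
    std::unique_ptr<DMatrix> ExternalMemoryFromFile(std::string const& libsvm) {
      // The iterator parses the file and exposes each batch through a proxy DMatrix.
      data::FileIterator iter(libsvm, 0, 1, "auto");
      // Reset/Next are plain function pointers; pages are pulled on demand and
      // spilled to a cache derived from the last argument (the cache prefix).
      return std::unique_ptr<DMatrix>{new data::SparsePageDMatrix{
          &iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
          std::numeric_limits<float>::quiet_NaN(), /*n_threads=*/1, libsvm}};
    }
    }  // namespace xgboost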
@@ -4,6 +4,7 @@
 #include "../helpers.h"
 #include "../../../src/common/compressed_iterator.h"
 #include "../../../src/data/ellpack_page.cuh"
+#include "../../../src/data/sparse_page_dmatrix.h"
 
 namespace xgboost {
 
@@ -14,13 +15,22 @@ TEST(SparsePageDMatrix, EllpackPage) {
   DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
 
   // Loop over the batches and assert the data is as expected
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 64})) {
-    EXPECT_EQ(batch.Size(), dmat->Info().num_row_);
+  size_t n = 0;
+  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
+    n += batch.Size();
   }
+  EXPECT_EQ(n, dmat->Info().num_row_);
 
-  EXPECT_TRUE(FileExists(tmp_file + ".cache"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.ellpack.page"));
+  auto path =
+      data::MakeId(tmp_file + ".cache",
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
+      ".row.page";
+  EXPECT_TRUE(FileExists(path));
+  path =
+      data::MakeId(tmp_file + ".cache",
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
+      ".ellpack.page";
+  EXPECT_TRUE(FileExists(path));
 
   delete dmat;
 }
@@ -30,12 +40,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
   std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, filename);
 
   // Loop over the batches and count the records
   int64_t batch_count = 0;
   int64_t row_count = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 7UL})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
     EXPECT_LT(batch.Size(), dmat->Info().num_row_);
     batch_count++;
     row_count += batch.Size();
@@ -43,7 +53,36 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
   EXPECT_GE(batch_count, 2);
   EXPECT_EQ(row_count, dmat->Info().num_row_);
 
-  EXPECT_TRUE(FileExists(filename + ".cache.ellpack.page"));
+  auto path =
+      data::MakeId(filename,
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
+      ".ellpack.page";
+}
+
+TEST(SparsePageDMatrix, RetainEllpackPage) {
+  auto m = CreateSparsePageDMatrix(10000);
+  auto batches = m->GetBatches<EllpackPage>({0, 32});
+  auto begin = batches.begin();
+  auto end = batches.end();
+
+  std::vector<HostDeviceVector<common::CompressedByteT>> gidx_buffers;
+  std::vector<std::shared_ptr<EllpackPage const>> iterators;
+  for (auto it = begin; it != end; ++it) {
+    iterators.push_back(it.Page());
+    gidx_buffers.emplace_back(HostDeviceVector<common::CompressedByteT>{});
+    gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size());
+    gidx_buffers.back().Copy((*it).Impl()->gidx_buffer);
+  }
+  ASSERT_GE(iterators.size(), 2);
+
+  for (size_t i = 0; i < iterators.size(); ++i) {
+    ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
+  }
+
+  // make sure it's const and the caller can not modify the content of page.
+  for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
+    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
+  }
 }
 
 TEST(SparsePageDMatrix, EllpackPageContent) {
@@ -59,7 +98,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, 2, 0};
+  BatchParam param{0, 2};
   auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
@@ -67,7 +106,17 @@
   EXPECT_EQ(impl->row_stride, 2);
   EXPECT_EQ(impl->Cuts().TotalBins(), 4);
 
-  auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
+  std::unique_ptr<EllpackPageImpl> impl_ext;
+  size_t offset = 0;
+  for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
+    if (!impl_ext) {
+      impl_ext.reset(new EllpackPageImpl(
+          batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
+          batch.Impl()->is_dense, batch.Impl()->row_stride, kRows));
+    }
+    auto n_elems = impl_ext->Copy(0, batch.Impl(), offset);
+    offset += n_elems;
+  }
   EXPECT_EQ(impl_ext->base_rowid, 0);
   EXPECT_EQ(impl_ext->n_rows, kRows);
   EXPECT_FALSE(impl_ext->is_dense);
@@ -109,7 +158,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, kMaxBins, kPageSize};
+  BatchParam param{0, kMaxBins};
   auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
@@ -150,7 +199,7 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, kMaxBins, kPageSize};
+  BatchParam param{0, kMaxBins};
 
   size_t current_row = 0;
   for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
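Note (reviewer sketch, not part of the diff): several hunks above drop the third (page-size) field from `BatchParam`. An illustrative sketch of how Ellpack batches are requested after this change; the field-name comments come from `gpu_id`/`max_bin` used later in this diff, and the internal include mirrors the test file above:

    // Sketch only: request Ellpack batches with {gpu_id, max_bin}; how many pages
    // stay resident is decided internally rather than by a page_size knob.
    #include "xgboost/data.h"                      // DMatrix, BatchParam
    #include "../../../src/data/ellpack_page.cuh"  // EllpackPage (internal header)

    namespace xgboost {
    size_t CountRowsViaEllpack(DMatrix* dmat) {
      BatchParam param{/*gpu_id=*/0, /*max_bin=*/256};
      size_t rows = 0;
      for (auto const& page : dmat->GetBatches<EllpackPage>(param)) {
        rows += page.Size();  // each page covers a contiguous slice of rows
      }
      return rows;  // equals dmat->Info().num_row_ once all pages are visited
    }
    }  // namespace xgboost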
@@ -155,7 +155,8 @@ TEST(GBTree, ChoosePredictor) {
   ASSERT_TRUE(data.HostCanWrite());
 
   // pull data into device.
-  data = HostDeviceVector<Entry>(data.HostVector(), 0);
+  data.HostVector();
+  data.SetDevice(0);
   data.DeviceSpan();
   ASSERT_FALSE(data.HostCanWrite());
 
@@ -18,6 +18,7 @@
 #include "xgboost/c_api.h"
 #include "../../src/data/adapter.h"
 #include "../../src/data/simple_dmatrix.h"
+#include "../../src/data/sparse_page_dmatrix.h"
 #include "../../src/gbm/gbtree_model.h"
 #include "xgboost/predictor.h"
 
@@ -45,12 +46,25 @@ void CreateSimpleTestData(const std::string& filename) {
   CreateBigTestData(filename, 6);
 }
 
-void CreateBigTestData(const std::string& filename, size_t n_entries) {
+void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based) {
   std::ofstream fo(filename.c_str());
   const size_t entries_per_row = 3;
+  std::string odd_row;
+  if (zero_based) {
+    odd_row = " 0:0 3:30 4:40\n";
+  } else {
+    odd_row = " 1:0 4:30 5:40\n";
+  }
+  std::string even_row;
+  if (zero_based) {
+    even_row = " 0:0 1:10 2:20\n";
+  } else {
+    even_row = " 1:0 2:10 3:20\n";
+  }
 
   size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row;
   for (size_t i = 0; i < n_rows; ++i) {
-    const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n";
+    auto row = i % 2 == 0 ? even_row : odd_row;
     fo << i << row;
   }
 }
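Note (reviewer sketch, not part of the diff): as a concrete illustration of the helper above, with the default `zero_based = true` and `n_entries = 6` the file holds two rows whose label is the row index, alternating between the even and odd templates:

    0 0:0 1:10 2:20
    1 0:0 3:30 4:40

With `zero_based = false` the same rows are written with 1-based feature indices (`1:0 2:10 3:20` and `1:0 4:30 5:40`), matching the traditional libsvm convention.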
@@ -348,13 +362,20 @@ GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
       &adapter, std::numeric_limits<float>::quiet_NaN(), 1));
 }
 
-std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
-    size_t n_entries, size_t page_size, std::string tmp_file) {
-  // Create sufficiently large data to make two row pages
-  CreateBigTestData(tmp_file, n_entries);
-  std::unique_ptr<DMatrix> dmat { DMatrix::Load(
-      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
+std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries,
+                                                 std::string prefix) {
+  size_t n_columns = 3;
+  size_t n_rows = n_entries / n_columns;
+  ArrayIterForTest iter(0, n_rows, n_columns, 2);
+
+  std::unique_ptr<DMatrix> dmat{DMatrix::Create(
+      static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
+      std::numeric_limits<float>::quiet_NaN(), 1, prefix)};
+  auto row_page_path =
+      data::MakeId(prefix,
+                   dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
+      ".row.page";
+  EXPECT_TRUE(FileExists(row_page_path)) << row_page_path;
 
   // Loop over the batches and count the records
   int64_t batch_count = 0;
@@ -368,7 +389,6 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
   return dmat;
 }
 
-
 std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
     size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
     const dmlc::TemporaryDirectory& tempdir) {
@@ -432,7 +452,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
     uri += "#" + tmp_file + ".cache";
   }
   std::unique_ptr<DMatrix> dmat(
-      DMatrix::Load(uri, true, false, "auto", page_size));
+      DMatrix::Load(uri, true, false, "auto"));
   return dmat;
 }
 
@@ -481,6 +501,28 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
   return gbm;
 }
 
+ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols,
+                                   size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} {
+  XGProxyDMatrixCreate(&proxy_);
+  rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
+  std::tie(batches_, interface_) =
+      rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
+}
+
+ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
+
+int ArrayIterForTest::Next() {
+  if (iter_ == n_batches_) {
+    return 0;
+  }
+  XGProxyDMatrixSetDataDense(proxy_, batches_[iter_].c_str());
+  iter_++;
+  return 1;
+}
+
+size_t constexpr ArrayIterForTest::kRows;
+size_t constexpr ArrayIterForTest::kCols;
+
 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
                   std::vector<size_t> *p_row_ptr,
                   std::vector<bst_feature_t> *p_cids) {
@@ -8,16 +8,16 @@ namespace xgboost {
 
 CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
                                            size_t cols, size_t batches)
-    : rows_{rows}, cols_{cols}, n_batches_{batches} {
-  XGProxyDMatrixCreate(&proxy_);
-  rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
+    : ArrayIterForTest{sparsity, rows, cols, batches} {
   rng_->Device(0);
   std::tie(batches_, interface_) =
       rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
   this->Reset();
 }
 
-CudaArrayIterForTest::~CudaArrayIterForTest() { XGDMatrixFree(proxy_); }
+size_t constexpr CudaArrayIterForTest::kRows;
+size_t constexpr CudaArrayIterForTest::kCols;
+size_t constexpr CudaArrayIterForTest::kBatches;
 
 int CudaArrayIterForTest::Next() {
   if (iter_ == n_batches_) {
@@ -28,8 +28,6 @@ int CudaArrayIterForTest::Next() {
   return 1;
 }
 
-size_t constexpr CudaArrayIterForTest::kRows;
-size_t constexpr CudaArrayIterForTest::kCols;
 
 std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label,
                                                                     bool float_label,
@@ -55,7 +55,9 @@ int64_t GetFileSize(const std::string& filename);
 
 void CreateSimpleTestData(const std::string& filename);
 
-void CreateBigTestData(const std::string& filename, size_t n_entries);
+// Create a libsvm format file with 3 entries per-row. `zero_based` specifies whether it's
+// 0-based indexing.
+void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true);
 
 void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                       std::vector<xgboost::bst_float> preds,
@@ -300,8 +302,7 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
 std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
                                             int num_rows, int num_columns);
 
-std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
-    size_t n_entries, size_t page_size, std::string tmp_file);
+std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");
 
 /**
  * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
@@ -356,7 +357,8 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row
 
 typedef void *DMatrixHandle;  // NOLINT(*);
 
-class CudaArrayIterForTest {
+class ArrayIterForTest {
+ protected:
   HostDeviceVector<float> data_;
   size_t iter_ {0};
   DMatrixHandle proxy_;
@@ -373,20 +375,32 @@ class CudaArrayIterForTest {
   size_t static constexpr kBatches { 100 };
   size_t static constexpr kCols { 13 };
 
-  explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
-                                size_t cols = kCols, size_t batches = kBatches);
-  ~CudaArrayIterForTest();
 
   std::string AsArray() const {
     return interface_;
   }
 
-  int Next();
-  void Reset() {
+  virtual int Next();
+  virtual void Reset() {
     iter_ = 0;
   }
   size_t Iter() const { return iter_; }
   auto Proxy() -> decltype(proxy_) { return proxy_; }
 
+  explicit ArrayIterForTest(float sparsity, size_t rows = kRows,
+                            size_t cols = kCols, size_t batches = kBatches);
+  virtual ~ArrayIterForTest();
+};
+
+class CudaArrayIterForTest : public ArrayIterForTest {
+ public:
+  size_t static constexpr kRows{1000};
+  size_t static constexpr kBatches{100};
+  size_t static constexpr kCols{13};
+
+  explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
+                                size_t cols = kCols, size_t batches = kBatches);
+  int Next() override;
+  ~CudaArrayIterForTest() override = default;
 };
 
 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
@@ -396,11 +410,11 @@ void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
 typedef void *DataIterHandle;  // NOLINT(*)
 
 inline void Reset(DataIterHandle self) {
-  static_cast<CudaArrayIterForTest*>(self)->Reset();
+  static_cast<ArrayIterForTest*>(self)->Reset();
 }
 
 inline int Next(DataIterHandle self) {
-  return static_cast<CudaArrayIterForTest*>(self)->Next();
+  return static_cast<ArrayIterForTest*>(self)->Next();
 }
 
 class RMMAllocator;
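Note (reviewer sketch, not part of the diff): the declarations above are what the in-memory callback path in the tests builds on. `ArrayIterForTest` owns a proxy DMatrix plus a random-data generator, and the free `Reset`/`Next` shims adapt it to the C-style callbacks consumed by `DMatrix::Create`. A minimal usage sketch, assuming compilation inside `tests/cpp` where this header is available (the commented parameter names are informal, not the upstream names):

    // Sketch only -- the same wiring CreateSparsePageDMatrix uses in helpers.cc.
    #include <limits>
    #include <memory>

    #include "helpers.h"  // ArrayIterForTest, Reset, Next, DataIterHandle

    namespace xgboost {
    std::unique_ptr<DMatrix> IteratorBackedDMatrix() {
      // Two dense batches of 64 x 3 random values, streamed through the proxy DMatrix.
      ArrayIterForTest iter(/*sparsity=*/0.0f, /*rows=*/64, /*cols=*/3, /*batches=*/2);
      return std::unique_ptr<DMatrix>{DMatrix::Create(
          static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
          std::numeric_limits<float>::quiet_NaN(), /*n_threads=*/1, /*cache_prefix=*/"cache")};
    }
    }  // namespace xgboost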
@@ -92,13 +92,10 @@ TEST(CpuPredictor, IterationRange) {
 }
 
 TEST(CpuPredictor, ExternalMemory) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
 
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
   auto lparam = CreateEmptyGenericParam(GPUIDX);
 
   std::unique_ptr<Predictor> cpu_predictor =
@@ -102,13 +102,10 @@ TEST(GPUPredictor, ExternalMemoryTest) {
 
   gbm::GBTreeModel model = CreateTestModel(&param, n_classes);
   std::vector<std::unique_ptr<DMatrix>> dmats;
-  dmlc::TemporaryDirectory tmpdir;
-  std::string file0 = tmpdir.path + "/big_0.libsvm";
-  std::string file1 = tmpdir.path + "/big_1.libsvm";
-  std::string file2 = tmpdir.path + "/big_2.libsvm";
-  dmats.push_back(CreateSparsePageDMatrix(400, 64UL, file0));
-  dmats.push_back(CreateSparsePageDMatrix(800, 128UL, file1));
-  dmats.push_back(CreateSparsePageDMatrix(8000, 1024UL, file2));
+  dmats.push_back(CreateSparsePageDMatrix(400));
+  dmats.push_back(CreateSparsePageDMatrix(800));
+  dmats.push_back(CreateSparsePageDMatrix(8000));
 
   for (const auto& dmat: dmats) {
     dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
@@ -98,8 +98,7 @@ TEST(Learner, SLOW_CheckMultiBatch) {  // NOLINT
   const std::string tmp_file = tempdir.path + "/big.libsvm";
   CreateBigTestData(tmp_file, 50000);
   std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(
-      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 100));
-  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
+      tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
   EXPECT_FALSE(dmat->SingleColBlock());
   size_t num_row = dmat->Info().num_row_;
   std::vector<bst_float> labels(num_row);
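Note (reviewer sketch, not part of the diff): this hunk drops the trailing page-size argument from `DMatrix::Load`, so URI-based external memory is requested purely through the `#cache-prefix` suffix and the page layout is managed internally. A small sketch of the resulting call; the commented argument names are an assumption about the upstream signature, and the file name is hypothetical:

    #include <memory>
    #include "xgboost/data.h"  // xgboost::DMatrix

    // Assumes "big.libsvm" already exists; cache files are derived from the
    // prefix after '#' plus an id generated internally.
    std::shared_ptr<xgboost::DMatrix> LoadExternalMemory() {
      return std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Load(
          "big.libsvm#big.libsvm.cache", /*silent=*/true, /*load_row_split=*/false,
          "auto")};
    }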
@@ -27,7 +27,7 @@ void VerifySampling(size_t page_size,
   }
   gpair.SetDevice(0);
 
-  BatchParam param{0, 256, page_size};
+  BatchParam param{0, 256};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   if (page_size != 0) {
     EXPECT_NE(page->n_rows, kRows);
@@ -82,7 +82,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   auto gpair = GenerateRandomGradients(kRows);
   gpair.SetDevice(0);
 
-  BatchParam param{0, 256, kPageSize};
+  BatchParam param{0, 256};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   EXPECT_NE(page->n_rows, kRows);
 
@@ -15,7 +15,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
 
   float sparsity = is_dense ? 0.0f : 0.5f;
   auto matrix = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix();
-  BatchParam batch_param{0, static_cast<int32_t>(kBins), 0};
+  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
 
   for (auto const& batch : matrix->GetBatches<EllpackPage>(batch_param)) {
     auto* page = batch.Impl();
@@ -116,7 +116,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   auto x = GenerateRandomCategoricalSingleColumn(kRows, num_categories);
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
-  BatchParam batch_param{0, static_cast<int32_t>(kBins), 0};
+  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
   tree::RowPartitioner row_partitioner(0, kRows);
   auto ridx = row_partitioner.GetRows(0);
   dh::device_vector<GradientPairPrecise> cat_hist(num_categories);
@@ -152,7 +152,6 @@ TEST(GpuHist, ApplySplit) {
   BatchParam bparam;
   bparam.gpu_id = 0;
   bparam.max_bin = 3;
-  bparam.gpu_page_size = 0;
 
   for (auto& ellpack : m->GetBatches<EllpackPage>(bparam)){
     auto impl = ellpack.Impl();
@@ -291,9 +290,13 @@ void TestHistogramIndexImpl() {
   // Extract the device maker from the histogram makers and from that its compressed
   // histogram index
   const auto &maker = hist_maker.maker;
+  auto grad = GenerateRandomGradients(kNRows);
+  grad.SetDevice(0);
+  maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
   std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
 
   const auto &maker_ext = hist_maker_ext.maker;
+  maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols);
   std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());
 
   ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
@@ -365,7 +368,7 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
   // Loop over the batches and count the records
   int64_t batch_count = 0;
   int64_t row_count = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin, gpu_page_size})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin})) {
     EXPECT_LT(batch.Size(), dmat->Info().num_row_);
     batch_count++;
     row_count += batch.Size();
@@ -386,7 +389,6 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
 
   tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker;
   GenericParameter generic_param(CreateEmptyGenericParam(0));
-  generic_param.gpu_page_size = gpu_page_size;
   hist_maker.Configure(args, &generic_param);
 
   hist_maker.Update(gpair, dmat, {tree});