Initial support for external memory in gradient index. (#7183)

* Add hessian to batch param in preparation of new approx impl.
* Extract a push method for gradient index matrix.
* Use span instead of vector ref for hessian in sketching.
* Create a binary format for gradient index.
This commit is contained in:
Jiaming Yuan
2021-09-13 12:40:56 +08:00
committed by GitHub
parent a0dcf6f5c1
commit 3515931305
26 changed files with 546 additions and 171 deletions

View File

@@ -18,6 +18,9 @@ namespace xgboost {
* index for CPU histogram. On GPU, the ellpack page is used instead.
*/
class GHistIndexMatrix {
/*!
 * \brief Helper that takes a single SparsePage batch; declared here, defined elsewhere.
 *
 * NOTE(review): the parameter meanings below are inferred from their names only and
 * should be confirmed against the definition:
 *  - rbegin presumably is the row offset of this batch,
 *  - prev_sum presumably is a running count carried over from earlier batches,
 *  - nbins presumably is the total number of bins,
 *  - n_threads presumably is the number of worker threads.
 */
void PushBatch(SparsePage const &batch, size_t rbegin, size_t prev_sum,
uint32_t nbins, int32_t n_threads);
public:
/*! \brief row pointer to rows by element position */
std::vector<size_t> row_ptr;
@@ -29,12 +32,16 @@ class GHistIndexMatrix {
common::HistogramCuts cut;
DMatrix* p_fmat;
size_t max_num_bins;
size_t base_rowid{0};
GHistIndexMatrix(DMatrix* x, int32_t max_bin) {
this->Init(x, max_bin);
GHistIndexMatrix() = default;
GHistIndexMatrix(DMatrix* x, int32_t max_bin, common::Span<float> hess = {}) {
this->Init(x, max_bin, hess);
}
// Create a global histogram matrix, given cut
void Init(DMatrix* p_fmat, int max_num_bins);
void Init(DMatrix* p_fmat, int max_num_bins, common::Span<float> hess);
void Init(SparsePage const &page, common::HistogramCuts const &cuts,
int32_t max_bins_per_feat, bool is_dense, int32_t n_threads);
// Specific method for sparse data, since there is no possibility of reducing the allocated memory.
template <typename BinIdxType, typename GetOffset>
@@ -77,6 +84,11 @@ class GHistIndexMatrix {
/*! \brief Report the current value of the stored dense flag. */
inline bool IsDense() const { return isDense_; }
/*! \brief Overwrite the stored dense flag with the given value. */
void SetDense(bool is_dense) {
  isDense_ = is_dense;
}
/*!
 * \brief Row count implied by row_ptr: one less than its length, or 0 when
 *        row_ptr holds no entries at all.
 */
bst_row_t Size() const {
  // Guard the empty case first so the subtraction below cannot wrap around.
  if (row_ptr.empty()) {
    return 0;
  }
  return row_ptr.size() - 1;
}
private:
std::vector<size_t> hit_count_tloc_;