Split up column matrix initialization. (#8060)
* Split up column matrix initialization. This PR splits column matrix initialization into two steps: the first initializes the storage, and the second performs the transpose. This allows the code to be reused for the Quantile DMatrix.
commit 8dd96013f1 (parent 36cf979b82)
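As a rough sketch of the call pattern this split is meant to enable: the helper below is hypothetical and not part of this change; only ColumnMatrix, GHistIndexMatrix, the ColumnMatrix(gmat, sparse_threshold) constructor, PushBatch, and batch.Size() come from the diff below, everything else is an assumption for illustration.

#include <cstdint>
#include "column_matrix.h"  // ColumnMatrix; GHistIndexMatrix is assumed visible through this header

namespace xgboost {
namespace common {
// Hypothetical helper, not part of this commit: build the CSC view from several
// adapter batches, the pattern the split initialization is intended to support.
template <typename BatchRange>
ColumnMatrix BuildFromBatches(BatchRange const& batches, GHistIndexMatrix const& gmat,
                              double sparse_threshold, float missing, int32_t n_threads) {
  // Step 1: classify columns and allocate per-feature storage once (InitStorage via the new ctor).
  ColumnMatrix columns{gmat, sparse_threshold};
  // Step 2: transpose each incoming batch into the preallocated storage at its row offset.
  size_t base_rowid = 0;
  for (auto const& batch : batches) {
    columns.PushBatch(n_threads, batch, missing, gmat, base_rowid);
    base_rowid += batch.Size();
  }
  return columns;
}
}  // namespace common
}  // namespace xgboost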
@@ -69,7 +69,10 @@
 #include "../src/learner.cc"
 #include "../src/logging.cc"
 #include "../src/global_config.cc"
+
+// common
 #include "../src/common/common.cc"
+#include "../src/common/column_matrix.cc"
 #include "../src/common/random.cc"
 #include "../src/common/charconv.cc"
 #include "../src/common/timer.cc"
src/common/column_matrix.cc (new file, 65 lines)
@@ -0,0 +1,65 @@
+/*!
+ * Copyright 2017-2022 by XGBoost Contributors
+ * \brief Utility for fast column-wise access
+ */
+#include "column_matrix.h"
+
+namespace xgboost {
+namespace common {
+void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
+  auto const nfeature = gmat.Features();
+  const size_t nrow = gmat.Size();
+  // identify type of each column
+  type_.resize(nfeature);
+
+  uint32_t max_val = std::numeric_limits<uint32_t>::max();
+  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
+    CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
+  }
+
+  bool all_dense_column = true;
+
+  std::vector<size_t> feature_counts(nfeature, 0);
+  gmat.GetFeatureCounts(feature_counts.data());
+
+  // classify features
+  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
+    if (static_cast<double>(feature_counts[fid]) < sparse_threshold * nrow) {
+      type_[fid] = kSparseColumn;
+      all_dense_column = false;
+    } else {
+      type_[fid] = kDenseColumn;
+    }
+  }
+
+  // want to compute storage boundary for each feature
+  // using variants of prefix sum scan
+  feature_offsets_.resize(nfeature + 1);
+  size_t accum_index = 0;
+  feature_offsets_[0] = accum_index;
+  for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
+    if (type_[fid - 1] == kDenseColumn) {
+      accum_index += static_cast<size_t>(nrow);
+    } else {
+      accum_index += feature_counts[fid - 1];
+    }
+    feature_offsets_[fid] = accum_index;
+  }
+
+  SetTypeSize(gmat.max_num_bins);
+  auto storage_size =
+      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
+  index_.resize(storage_size, 0);
+  if (!all_dense_column) {
+    row_ind_.resize(feature_offsets_[nfeature]);
+  }
+
+  // store least bin id for each feature
+  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
+
+  any_missing_ = !gmat.IsDense();
+
+  missing_flags_.clear();
+}
+}  // namespace common
+}  // namespace xgboost
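To illustrate the storage-boundary prefix sum above with made-up numbers: for nrow = 4 and three columns where columns 0 and 2 are classified dense and column 1 is sparse with 2 nonzeros, the scan gives feature_offsets_ = [0, 4, 6, 10]. Each dense column reserves nrow slots, a sparse column reserves only its nonzero count, and index_ is then sized to feature_offsets_.back() times the bin type size.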
@@ -133,77 +133,33 @@ class DenseColumnIter : public Column<BinIdxT> {
  * column.
  */
 class ColumnMatrix {
+  void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);
+
  public:
   // get number of features
   bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
 
+  ColumnMatrix() = default;
+  ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
+    this->InitStorage(gmat, sparse_threshold);
+  }
+
   template <typename Batch>
-  void Init(Batch const& batch, float missing, GHistIndexMatrix const& gmat,
-            double sparse_threshold, int32_t n_threads) {
-    auto const nfeature = static_cast<bst_feature_t>(gmat.cut.Ptrs().size() - 1);
-    const size_t nrow = gmat.row_ptr.size() - 1;
-    // identify type of each column
-    feature_counts_.resize(nfeature);
-    type_.resize(nfeature);
-    std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
-    uint32_t max_val = std::numeric_limits<uint32_t>::max();
-    for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
-      CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
-    }
-
-    bool all_dense_column = true;
-    gmat.GetFeatureCounts(&feature_counts_[0]);
-    // classify features
-    for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
-      if (static_cast<double>(feature_counts_[fid]) < sparse_threshold * nrow) {
-        type_[fid] = kSparseColumn;
-        all_dense_column = false;
-      } else {
-        type_[fid] = kDenseColumn;
-      }
-    }
-
-    // want to compute storage boundary for each feature
-    // using variants of prefix sum scan
-    feature_offsets_.resize(nfeature + 1);
-    size_t accum_index = 0;
-    feature_offsets_[0] = accum_index;
-    for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
-      if (type_[fid - 1] == kDenseColumn) {
-        accum_index += static_cast<size_t>(nrow);
-      } else {
-        accum_index += feature_counts_[fid - 1];
-      }
-      feature_offsets_[fid] = accum_index;
-    }
-
-    SetTypeSize(gmat.max_num_bins);
-    auto storage_size =
-        feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
-    index_.resize(storage_size, 0);
-    if (!all_dense_column) {
-      row_ind_.resize(feature_offsets_[nfeature]);
-    }
-
-    // store least bin id for each feature
-    index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
-
-    any_missing_ = !gmat.IsDense();
-
-    missing_flags_.clear();
-
+  void PushBatch(int32_t n_threads, Batch const& batch, float missing, GHistIndexMatrix const& gmat,
+                 size_t base_rowid) {
     // pre-fill index_ for dense columns
-    BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
+    auto n_features = gmat.Features();
     if (!any_missing_) {
-      missing_flags_.resize(feature_offsets_[nfeature], false);
+      missing_flags_.resize(feature_offsets_[n_features], false);
       // row index is compressed, we need to dispatch it.
-      DispatchBinType(gmat_bin_size, [&, nrow, nfeature, n_threads](auto t) {
+      DispatchBinType(gmat.index.GetBinTypeSize(), [&, size = batch.Size(), n_features = n_features,
+                                                    n_threads = n_threads](auto t) {
         using RowBinIdxT = decltype(t);
-        SetIndexNoMissing(gmat.index.data<RowBinIdxT>(), nrow, nfeature, n_threads);
+        SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), size, n_features, n_threads);
       });
     } else {
-      missing_flags_.resize(feature_offsets_[nfeature], true);
-      SetIndexMixedColumns(batch, gmat.index.data<uint32_t>(), gmat, nfeature, missing);
+      missing_flags_.resize(feature_offsets_[n_features], true);
+      SetIndexMixedColumns(base_rowid, batch, gmat, n_features, missing);
     }
   }
 
@@ -211,7 +167,9 @@ class ColumnMatrix {
   void Init(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
             int32_t n_threads) {
     auto batch = data::SparsePageAdapterBatch{page.GetView()};
-    this->Init(batch, std::numeric_limits<float>::quiet_NaN(), gmat, sparse_threshold, n_threads);
+    this->InitStorage(gmat, sparse_threshold);
+    // ignore base row id here as we always has one column matrix for each sparse page.
+    this->PushBatch(n_threads, batch, std::numeric_limits<float>::quiet_NaN(), gmat, 0);
   }
 
   /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
@@ -250,17 +208,17 @@ class ColumnMatrix {
   // all columns are dense column and has no missing value
   // FIXME(jiamingy): We don't need a column matrix if there's no missing value.
   template <typename RowBinIdxT>
-  void SetIndexNoMissing(RowBinIdxT const* row_index, const size_t n_samples,
+  void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
                          const size_t n_features, int32_t n_threads) {
     DispatchBinType(bins_type_size_, [&](auto t) {
       using ColumnBinT = decltype(t);
       auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
                                            index_.size() / sizeof(ColumnBinT)};
       ParallelFor(n_samples, n_threads, [&](auto rid) {
+        rid += base_rowid;
         const size_t ibegin = rid * n_features;
         const size_t iend = (rid + 1) * n_features;
-        size_t j = 0;
-        for (size_t i = ibegin; i < iend; ++i, ++j) {
+        for (size_t i = ibegin, j = 0; i < iend; ++i, ++j) {
           const size_t idx = feature_offsets_[j];
           // No need to add offset, as row index is compressed and stores the local index
           column_index[idx + rid] = row_index[i];
@@ -273,16 +231,15 @@ class ColumnMatrix {
    * \brief Set column index for both dense and sparse columns
    */
   template <typename Batch>
-  void SetIndexMixedColumns(Batch const& batch, uint32_t const* row_index,
-                            const GHistIndexMatrix& gmat, size_t n_features, float missing) {
-    std::vector<size_t> num_nonzeros;
-    num_nonzeros.resize(n_features, 0);
+  void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
+                            size_t n_features, float missing) {
+    auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
     auto is_valid = data::IsValidFunctor {missing};
 
     DispatchBinType(bins_type_size_, [&](auto t) {
       using ColumnBinT = decltype(t);
       ColumnBinT* local_index = reinterpret_cast<ColumnBinT*>(index_.data());
+      num_nonzeros_.resize(n_features, 0);
       auto get_bin_idx = [&](auto bin_id, auto rid, bst_feature_t fid) {
         if (type_[fid] == kDenseColumn) {
           ColumnBinT* begin = &local_index[feature_offsets_[fid]];
@@ -292,13 +249,13 @@ class ColumnMatrix {
           missing_flags_[feature_offsets_[fid] + rid] = false;
         } else {
           ColumnBinT* begin = &local_index[feature_offsets_[fid]];
-          begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
-          row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid;
-          ++num_nonzeros[fid];
+          begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
+          row_ind_[feature_offsets_[fid] + num_nonzeros_[fid]] = rid;
+          ++num_nonzeros_[fid];
         }
       };
 
-      const size_t batch_size = gmat.Size();
+      size_t const batch_size = batch.Size();
       size_t k{0};
       for (size_t rid = 0; rid < batch_size; ++rid) {
         auto line = batch.GetLine(rid);
@@ -307,7 +264,7 @@ class ColumnMatrix {
           if (is_valid(coo)) {
             auto fid = coo.column_idx;
             const uint32_t bin_id = row_index[k];
-            get_bin_idx(bin_id, rid, fid);
+            get_bin_idx(bin_id, rid + base_rowid, fid);
             ++k;
           }
         }
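In the hunks above, row indices coming from a pushed batch are local to that batch; adding base_rowid maps them to global row positions, so successive PushBatch calls fill disjoint row ranges of the storage that InitStorage allocated for the whole matrix.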
@@ -324,7 +281,6 @@ class ColumnMatrix {
   // IO procedures for external memory.
   bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
     fi->Read(&index_);
-    fi->Read(&feature_counts_);
 #if !DMLC_LITTLE_ENDIAN
     // s390x
     std::vector<std::underlying_type<ColumnType>::type> int_types;
@@ -361,7 +317,6 @@ class ColumnMatrix {
       sizeof(uint64_t);
     };
     write_vec(index_);
-    write_vec(feature_counts_);
 #if !DMLC_LITTLE_ENDIAN
     // s390x
     std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
@@ -391,11 +346,13 @@ class ColumnMatrix {
  private:
   std::vector<uint8_t> index_;
 
-  std::vector<size_t> feature_counts_;
   std::vector<ColumnType> type_;
+  /* indptr of a CSC matrix. */
   std::vector<size_t> row_ind_;
   /* indicate where each column's index and row_ind is stored. */
   std::vector<size_t> feature_offsets_;
+  /* The number of nnz of each column. */
+  std::vector<size_t> num_nonzeros_;
 
   // index_base_[fid]: least bin id for feature fid
   uint32_t const* index_base_;
@@ -109,9 +109,8 @@ class GHistIndexMatrix {
    */
   size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
 
-  bst_row_t Size() const {
-    return row_ptr.empty() ? 0 : row_ptr.size() - 1;
-  }
+  bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
+  bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
 
   bool ReadColumnPage(dmlc::SeekStream* fi);
   size_t WriteColumnPage(dmlc::Stream* fo) const;