Initial support for external memory in gradient index. (#7183)

* Add hessian to batch param in preparation for new approx impl.
* Extract a push method for gradient index matrix.
* Use span instead of vector ref for hessian in sketching.
* Create a binary format for gradient index.
This commit is contained in:
Jiaming Yuan
2021-09-13 12:40:56 +08:00
committed by GitHub
parent a0dcf6f5c1
commit 3515931305
26 changed files with 546 additions and 171 deletions

View File

@@ -111,7 +111,7 @@ class HistogramCuts {
};
inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
std::vector<float> const &hessian = {}) {
Span<float> const hessian = {}) {
HistogramCuts out;
auto const& info = m->Info();
const auto threads = omp_get_max_threads();
@@ -136,7 +136,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
return out;
}
enum BinTypeSize {
enum BinTypeSize : uint32_t {
kUint8BinsTypeSize = 1,
kUint16BinsTypeSize = 2,
kUint32BinsTypeSize = 4
@@ -207,6 +207,13 @@ struct Index {
return data_.end();
}
// Returns a mutable byte iterator positioned at the beginning of `data_`.
std::vector<uint8_t>::iterator begin() { // NOLINT
return data_.begin();
}
// Returns a mutable byte iterator positioned one past the last element of `data_`.
std::vector<uint8_t>::iterator end() { // NOLINT
return data_.end();
}
private:
static uint32_t GetValueFromUint8(void *t, size_t i) {
return reinterpret_cast<uint8_t*>(t)[i];

View File

@@ -94,26 +94,26 @@ std::vector<bst_feature_t> HostSketchContainer::LoadBalance(
namespace {
// Function to merge hessian and sample weights
// Builds and returns a vector with one entry per hessian value, each being the
// hessian multiplied by a weight taken from `info.weights_`.
//   info      - supplies `num_row_` (checked against hessian.size()),
//               `group_ptr_` and `weights_`.
//   hessian   - per-row values to be scaled.
//   use_group - when true the weight is looked up by group index, otherwise by
//               row index.
//   n_threads - thread count handed to ParallelFor in the per-row branch.
// NOTE(review): this is a diff hunk whose +/- markers were lost, so a replaced
// line and its replacement appear back to back (e.g. the two `hessian`
// parameters, and the `group_weights` / `sample_weights` lines next to their
// `get_weight` counterparts).
std::vector<float> MergeWeights(MetaInfo const &info,
std::vector<float> const &hessian,
Span<float> const hessian,
bool use_group, int32_t n_threads) {
CHECK_EQ(hessian.size(), info.num_row_);
std::vector<float> results(hessian.size());
auto const &group_ptr = info.group_ptr_;
auto const& weights = info.weights_.HostVector();
// An empty weight vector is treated as a weight of 1.0f for every index.
auto get_weight = [&](size_t i) { return weights.empty() ? 1.0f : weights[i]; };
if (use_group) {
auto const &group_weights = info.weights_.HostVector();
// At least one group is required, and the last entry of group_ptr must equal
// the number of hessian values.
CHECK_GE(group_ptr.size(), 2);
CHECK_EQ(group_ptr.back(), hessian.size());
size_t cur_group = 0;
for (size_t i = 0; i < hessian.size(); ++i) {
results[i] = hessian[i] * group_weights[cur_group];
results[i] = hessian[i] * get_weight(cur_group);
// NOTE(review): the group index advances only after row i has been weighted,
// so the row at i == group_ptr[cur_group + 1] still uses the previous group's
// weight. Presumably an off-by-one if group_ptr holds group start offsets --
// confirm against the callers.
if (i == group_ptr[cur_group + 1]) {
cur_group++;
}
}
} else {
auto const &sample_weights = info.weights_.HostVector();
// Each iteration writes only results[i].
ParallelFor(hessian.size(), n_threads, Sched::Auto(),
[&](auto i) { results[i] = hessian[i] * sample_weights[i]; });
[&](auto i) { results[i] = hessian[i] * get_weight(i); });
}
return results;
}
@@ -141,7 +141,7 @@ std::vector<float> UnrollGroupWeights(MetaInfo const &info) {
} // anonymous namespace
void HostSketchContainer::PushRowPage(
SparsePage const &page, MetaInfo const &info, std::vector<float> const &hessian) {
SparsePage const &page, MetaInfo const &info, Span<float> hessian) {
monitor_.Start(__func__);
bst_feature_t n_columns = info.num_col_;
auto is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;

View File

@@ -760,7 +760,7 @@ class HostSketchContainer {
/* \brief Push a CSR matrix. */
void PushRowPage(SparsePage const &page, MetaInfo const &info,
std::vector<float> const &hessian = {});
Span<float> const hessian = {});
void MakeCuts(HistogramCuts* cuts);
};