Optional by-column histogram build. (#8233)
Co-authored-by: dmitry.razdoburdin <drazdobu@jfldaal005.jf.intel.com>
This commit is contained in:
parent
b791446623
commit
eb7bbee2c9
@ -140,7 +140,7 @@ struct Prefetch {
|
||||
constexpr size_t Prefetch::kNoPrefetchSize;
|
||||
|
||||
template <bool do_prefetch, typename BinIdxType, bool first_page, bool any_missing = true>
|
||||
void BuildHistKernel(const std::vector<GradientPair> &gpair,
|
||||
void RowsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
|
||||
GHistRow hist) {
|
||||
const size_t size = row_indices.Size();
|
||||
@ -204,75 +204,136 @@ void BuildHistKernel(const std::vector<GradientPair> &gpair,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename BinIdxType, bool first_page, bool any_missing>
|
||||
void ColsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
|
||||
GHistRow hist) {
|
||||
const size_t size = row_indices.Size();
|
||||
const size_t *rid = row_indices.begin;
|
||||
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
|
||||
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();
|
||||
|
||||
auto const &row_ptr = gmat.row_ptr.data();
|
||||
auto base_rowid = gmat.base_rowid;
|
||||
const uint32_t *offsets = gmat.index.Offset();
|
||||
auto get_row_ptr = [&](size_t ridx) {
|
||||
return first_page ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
|
||||
};
|
||||
auto get_rid = [&](size_t ridx) {
|
||||
return first_page ? ridx : (ridx - base_rowid);
|
||||
};
|
||||
|
||||
const size_t n_features = gmat.cut.Ptrs().size() - 1;
|
||||
const size_t n_columns = n_features;
|
||||
auto hist_data = reinterpret_cast<double *>(hist.data());
|
||||
const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains
|
||||
// 2 FP values: gradient and hessian.
|
||||
// So we need to multiply each row-index/bin-index by 2
|
||||
// to work with gradient pairs as a singe row FP array
|
||||
for (size_t cid = 0; cid < n_columns; ++cid) {
|
||||
const uint32_t offset = any_missing ? 0 : offsets[cid];
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const size_t row_id = rid[i];
|
||||
const size_t icol_start =
|
||||
any_missing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
|
||||
const size_t icol_end =
|
||||
any_missing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;
|
||||
|
||||
if (cid < icol_end - icol_start) {
|
||||
const BinIdxType *gr_index_local = gradient_index + icol_start;
|
||||
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[cid]) + offset);
|
||||
auto hist_local = hist_data + idx_bin;
|
||||
|
||||
const size_t idx_gh = two * row_id;
|
||||
// The trick with pgh_t buffer helps the compiler to generate faster binary.
|
||||
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
|
||||
*(hist_local) += pgh_t[0];
|
||||
*(hist_local + 1) += pgh_t[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <bool do_prefetch, typename BinIdxType, bool first_page,
|
||||
bool any_missing>
|
||||
void BuildHistKernel(const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
|
||||
GHistRow hist, bool read_by_column) {
|
||||
if (read_by_column) {
|
||||
ColsWiseBuildHistKernel<BinIdxType, first_page, any_missing>
|
||||
(gpair, row_indices, gmat, hist);
|
||||
} else {
|
||||
RowsWiseBuildHistKernel<do_prefetch, BinIdxType, first_page, any_missing>
|
||||
(gpair, row_indices, gmat, hist);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool do_prefetch, bool any_missing>
|
||||
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
|
||||
GHistRow hist) {
|
||||
GHistRow hist, bool read_by_column) {
|
||||
auto first_page = gmat.base_rowid == 0;
|
||||
DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
|
||||
using BinIdxType = decltype(t);
|
||||
if (first_page) {
|
||||
switch (gmat.index.GetBinTypeSize()) {
|
||||
case kUint8BinsTypeSize:
|
||||
BuildHistKernel<do_prefetch, uint8_t, true, any_missing>(gpair, row_indices, gmat, hist);
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
BuildHistKernel<do_prefetch, uint16_t, true, any_missing>(gpair, row_indices, gmat, hist);
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
BuildHistKernel<do_prefetch, uint32_t, true, any_missing>(gpair, row_indices, gmat, hist);
|
||||
break;
|
||||
default:
|
||||
CHECK(false); // no default behavior
|
||||
}
|
||||
BuildHistKernel<do_prefetch, BinIdxType, true, any_missing>
|
||||
(gpair, row_indices, gmat, hist, read_by_column);
|
||||
} else {
|
||||
switch (gmat.index.GetBinTypeSize()) {
|
||||
case kUint8BinsTypeSize:
|
||||
BuildHistKernel<do_prefetch, uint8_t, false, any_missing>(gpair, row_indices, gmat, hist);
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
BuildHistKernel<do_prefetch, uint16_t, false, any_missing>(gpair, row_indices, gmat, hist);
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
BuildHistKernel<do_prefetch, uint32_t, false, any_missing>(gpair, row_indices, gmat, hist);
|
||||
break;
|
||||
default:
|
||||
CHECK(false); // no default behavior
|
||||
}
|
||||
BuildHistKernel<do_prefetch, BinIdxType, false, any_missing>
|
||||
(gpair, row_indices, gmat, hist, read_by_column);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <bool any_missing>
|
||||
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
|
||||
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
|
||||
GHistRow hist) const {
|
||||
GHistRow hist, bool read_by_column) {
|
||||
const size_t nrows = row_indices.Size();
|
||||
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);
|
||||
|
||||
// if need to work with all rows from bin-matrix (e.g. root node)
|
||||
const bool contiguousBlock =
|
||||
(row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);
|
||||
|
||||
if (contiguousBlock) {
|
||||
// contiguous memory access, built-in HW prefetching is enough
|
||||
BuildHistDispatch<false, any_missing>(gpair, row_indices,
|
||||
gmat, hist);
|
||||
BuildHistDispatch<false, any_missing>(gpair, row_indices, gmat, hist, read_by_column);
|
||||
} else {
|
||||
const RowSetCollection::Elem span1(row_indices.begin,
|
||||
row_indices.end - no_prefetch_size);
|
||||
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size,
|
||||
row_indices.end);
|
||||
|
||||
BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist);
|
||||
BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist, read_by_column);
|
||||
// no prefetching to avoid loading extra memory
|
||||
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist);
|
||||
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist, read_by_column);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool any_missing>
|
||||
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix &gmat,
|
||||
GHistRow hist, bool force_read_by_column) const {
|
||||
/* force_read_by_column is used for testing the columnwise building of histograms.
|
||||
* default force_read_by_column = false
|
||||
*/
|
||||
constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
|
||||
const bool hist_fit_to_l2 = kAdhocL2Size > 2*sizeof(float)*gmat.cut.Ptrs().back();
|
||||
const bool read_by_column = !hist_fit_to_l2 && !any_missing;
|
||||
|
||||
BuildHistDispatch<any_missing>(gpair, row_indices, gmat, hist, read_by_column ||
|
||||
force_read_by_column);
|
||||
}
|
||||
|
||||
// Explicit instantiations of GHistBuilder::BuildHist for data with
// (any_missing = true) and without (any_missing = false) missing values.
// The parameter list must match the definition, including the trailing
// force_read_by_column flag.
template void GHistBuilder::BuildHist<true>(const std::vector<GradientPair> &gpair,
                                            const RowSetCollection::Elem row_indices,
                                            const GHistIndexMatrix &gmat, GHistRow hist,
                                            bool force_read_by_column) const;

template void GHistBuilder::BuildHist<false>(const std::vector<GradientPair> &gpair,
                                             const RowSetCollection::Elem row_indices,
                                             const GHistIndexMatrix &gmat, GHistRow hist,
                                             bool force_read_by_column) const;
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@ -623,7 +623,8 @@ class GHistBuilder {
|
||||
// construct a histogram via histogram aggregation
|
||||
template <bool any_missing>
|
||||
void BuildHist(const std::vector<GradientPair>& gpair, const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat, GHistRow hist) const;
|
||||
const GHistIndexMatrix& gmat, GHistRow hist,
|
||||
bool force_read_by_column = false) const;
|
||||
uint32_t GetNumBins() const {
|
||||
return nbins_;
|
||||
}
|
||||
|
||||
@ -59,7 +59,8 @@ class HistogramBuilder {
|
||||
GHistIndexMatrix const &gidx,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
common::RowSetCollection const &row_set_collection,
|
||||
const std::vector<GradientPair> &gpair_h) {
|
||||
const std::vector<GradientPair> &gpair_h,
|
||||
bool force_read_by_column) {
|
||||
const size_t n_nodes = nodes_for_explicit_hist_build.size();
|
||||
CHECK_GT(n_nodes, 0);
|
||||
|
||||
@ -86,7 +87,8 @@ class HistogramBuilder {
|
||||
elem.begin + end_of_row_set, nid);
|
||||
auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
|
||||
if (rid_set.Size() != 0) {
|
||||
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist);
|
||||
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
|
||||
force_read_by_column);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -112,7 +114,8 @@ class HistogramBuilder {
|
||||
RegTree *p_tree, common::RowSetCollection const &row_set_collection,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
std::vector<GradientPair> const &gpair) {
|
||||
std::vector<GradientPair> const &gpair,
|
||||
bool force_read_by_column = false) {
|
||||
int starting_index = std::numeric_limits<int>::max();
|
||||
int sync_count = 0;
|
||||
if (page_id == 0) {
|
||||
@ -123,11 +126,13 @@ class HistogramBuilder {
|
||||
if (gidx.IsDense()) {
|
||||
this->BuildLocalHistograms<false>(page_id, space, gidx,
|
||||
nodes_for_explicit_hist_build,
|
||||
row_set_collection, gpair);
|
||||
row_set_collection, gpair,
|
||||
force_read_by_column);
|
||||
} else {
|
||||
this->BuildLocalHistograms<true>(page_id, space, gidx,
|
||||
nodes_for_explicit_hist_build,
|
||||
row_set_collection, gpair);
|
||||
row_set_collection, gpair,
|
||||
force_read_by_column);
|
||||
}
|
||||
|
||||
CHECK_GE(n_batches_, 1);
|
||||
@ -148,7 +153,8 @@ class HistogramBuilder {
|
||||
common::RowSetCollection const &row_set_collection,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
std::vector<GradientPair> const &gpair) {
|
||||
std::vector<GradientPair> const &gpair,
|
||||
bool force_read_by_column = false) {
|
||||
const size_t n_nodes = nodes_for_explicit_hist_build.size();
|
||||
// create space of size (# rows in each node)
|
||||
common::BlockedSpace2d space(
|
||||
@ -160,7 +166,7 @@ class HistogramBuilder {
|
||||
256);
|
||||
this->BuildHist(page_id, space, gidx, p_tree, row_set_collection,
|
||||
nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
|
||||
gpair);
|
||||
gpair, force_read_by_column);
|
||||
}
|
||||
|
||||
void SyncHistogramDistributed(
|
||||
|
||||
@ -12,7 +12,7 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
void TestEvaluateSplits() {
|
||||
void TestEvaluateSplits(bool force_read_by_column) {
|
||||
int static constexpr kRows = 8, kCols = 16;
|
||||
auto orig = omp_get_max_threads();
|
||||
int32_t n_threads = std::min(omp_get_max_threads(), 4);
|
||||
@ -44,7 +44,7 @@ void TestEvaluateSplits() {
|
||||
hist.AddHistRow(0);
|
||||
hist.AllocateAllData();
|
||||
hist_builder.template BuildHist<false>(row_gpairs, row_set_collection[0],
|
||||
gmat, hist[0]);
|
||||
gmat, hist[0], force_read_by_column);
|
||||
|
||||
// Compute total gradient for all data points
|
||||
GradientPairPrecise total_gpair;
|
||||
@ -84,7 +84,10 @@ void TestEvaluateSplits() {
|
||||
omp_set_num_threads(orig);
|
||||
}
|
||||
|
||||
// Run split evaluation with the default kernel selection and again with the
// column-wise histogram build forced on.
TEST(HistEvaluator, Evaluate) {
  TestEvaluateSplits(false);
  TestEvaluateSplits(true);
}
|
||||
|
||||
TEST(HistEvaluator, Apply) {
|
||||
RegTree tree;
|
||||
|
||||
@ -225,7 +225,7 @@ TEST(CPUHistogram, SyncHist) {
|
||||
TestSyncHist(false);
|
||||
}
|
||||
|
||||
void TestBuildHistogram(bool is_distributed) {
|
||||
void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
|
||||
size_t constexpr kNRows = 8, kNCols = 16;
|
||||
int32_t constexpr kMaxBins = 4;
|
||||
auto p_fmat =
|
||||
@ -256,7 +256,7 @@ void TestBuildHistogram(bool is_distributed) {
|
||||
nodes_for_explicit_hist_build.push_back(node);
|
||||
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>({kMaxBins, 0.5})) {
|
||||
histogram.BuildHist(0, gidx, &tree, row_set_collection,
|
||||
nodes_for_explicit_hist_build, {}, gpair);
|
||||
nodes_for_explicit_hist_build, {}, gpair, force_read_by_column);
|
||||
}
|
||||
|
||||
// Check if number of histogram bins is correct
|
||||
@ -283,12 +283,15 @@ void TestBuildHistogram(bool is_distributed) {
|
||||
}
|
||||
|
||||
// Cover every combination of (is_distributed, force_read_by_column).
TEST(CPUHistogram, BuildHist) {
  TestBuildHistogram(true, false);
  TestBuildHistogram(false, false);
  TestBuildHistogram(true, true);
  TestBuildHistogram(false, true);
}
|
||||
|
||||
namespace {
|
||||
void TestHistogramCategorical(size_t n_categories) {
|
||||
void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
|
||||
size_t constexpr kRows = 340;
|
||||
int32_t constexpr kBins = 256;
|
||||
auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories);
|
||||
@ -318,7 +321,8 @@ void TestHistogramCategorical(size_t n_categories) {
|
||||
auto total_bins = gidx.cut.TotalBins();
|
||||
cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
|
||||
cat_hist.BuildHist(0, gidx, &tree, row_set_collection,
|
||||
nodes_for_explicit_hist_build, {}, gpair.HostVector());
|
||||
nodes_for_explicit_hist_build, {}, gpair.HostVector(),
|
||||
force_read_by_column);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -331,7 +335,8 @@ void TestHistogramCategorical(size_t n_categories) {
|
||||
auto total_bins = gidx.cut.TotalBins();
|
||||
onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
|
||||
onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
|
||||
gpair.HostVector());
|
||||
gpair.HostVector(),
|
||||
force_read_by_column);
|
||||
}
|
||||
|
||||
auto cat = cat_hist.Histogram()[0];
|
||||
@ -342,11 +347,14 @@ void TestHistogramCategorical(size_t n_categories) {
|
||||
|
||||
// Check categorical histograms for 2..7 categories, first with the default
// kernel selection and then with the column-wise build forced on.
TEST(CPUHistogram, Categorical) {
  for (size_t n_categories = 2; n_categories < 8; ++n_categories) {
    TestHistogramCategorical(n_categories, false);
  }
  for (size_t n_categories = 2; n_categories < 8; ++n_categories) {
    TestHistogramCategorical(n_categories, true);
  }
}
|
||||
namespace {
|
||||
void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
|
||||
void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool force_read_by_column) {
|
||||
size_t constexpr kEntries = 1 << 16;
|
||||
auto m = CreateSparsePageDMatrix(kEntries, "cache");
|
||||
|
||||
@ -394,7 +402,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
|
||||
size_t page_idx{0};
|
||||
for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
|
||||
multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {},
|
||||
h_gpair);
|
||||
h_gpair, force_read_by_column);
|
||||
++page_idx;
|
||||
}
|
||||
ASSERT_EQ(page_idx, 2);
|
||||
@ -421,7 +429,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
|
||||
false, hess);
|
||||
GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false,
|
||||
std::numeric_limits<double>::quiet_NaN(), common::OmpGetNumThreads(0));
|
||||
single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair);
|
||||
single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column);
|
||||
single_page = single_build.Histogram()[0];
|
||||
}
|
||||
|
||||
@ -434,12 +442,15 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
|
||||
|
||||
// Each external-memory configuration is run twice: with the default kernel
// selection and with the column-wise histogram build forced on.
TEST(CPUHistogram, ExternalMemory) {
  int32_t constexpr kBins = 256;
  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, false);
  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, true);

  float sparse_thresh{0.5};
  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
  sparse_thresh = std::numeric_limits<float>::quiet_NaN();
  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
}
|
||||
} // namespace tree
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user