Optimized ApplySplit, BuildHist and UpdatePredictCache functions on CPU (#5244)

* Split up sparse and dense build hist kernels.
* Add `PartitionBuilder`.
This commit is contained in:
Egor Smirnov
2020-02-29 11:11:42 +03:00
committed by GitHub
parent b81f8cbbc0
commit 1b97eaf7a7
9 changed files with 694 additions and 387 deletions

View File

@@ -672,7 +672,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}
/*!
* \brief fill a histogram by zeroes
* \brief fill a histogram by zeros in range [begin, end)
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
memset(hist.data() + begin, '\0', (end-begin)*sizeof(tree::GradStats));
@@ -719,40 +719,141 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
}
}
struct Prefetch {
public:
static constexpr size_t kCacheLineSize = 64;
static constexpr size_t kPrefetchOffset = 10;
static constexpr size_t kPrefetchStep =
kCacheLineSize / sizeof(decltype(GHistIndexMatrix::index)::value_type);
private:
static constexpr size_t kNoPrefetchSize =
kPrefetchOffset + kCacheLineSize /
sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);
public:
static size_t NoPrefetchSize(size_t rows) {
return std::min(rows, kNoPrefetchSize);
}
};
constexpr size_t Prefetch::kNoPrefetchSize;
template<typename FPType, bool do_prefetch>
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const size_t n_features,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t i = 0; i < size; ++i) {
const size_t icol_start = rid[i] * n_features;
const size_t idx_gh = two * rid[i];
if (do_prefetch) {
const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
j += Prefetch::kPrefetchStep) {
PREFETCH_READ_T0(gradient_index + j);
}
}
for (size_t j = icol_start; j < icol_start + n_features; ++j) {
const uint32_t idx_bin = two * gradient_index[j];
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}
template<typename FPType, bool do_prefetch>
void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t i = 0; i < size; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];
const size_t idx_gh = two * rid[i];
if (do_prefetch) {
const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prftch; j < icol_end_prefect; j+=Prefetch::kPrefetchStep) {
PREFETCH_READ_T0(gradient_index + j);
}
}
for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = two * gradient_index[j];
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}
template<typename FPType, bool do_prefetch>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, const bool isDense, GHistRow hist) {
if (row_indices.Size() && isDense) {
const size_t* row_ptr = gmat.row_ptr.data();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
BuildHistDenseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, n_features, hist);
} else {
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, hist);
}
}
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t* rid = row_indices.begin;
GHistRow hist,
bool isDense) {
using FPType = decltype(tree::GradStats::sum_grad);
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);
double* hist_data = reinterpret_cast<double*>(hist.data());
// if need to work with all rows from bin-matrix (e.g. root node)
const bool contiguousBlock = (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);
const size_t cache_line_size = 64;
const size_t prefetch_offset = 10;
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistKernel<FPType, false>(gpair, row_indices, gmat, isDense, hist);
} else {
const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);
for (size_t i = 0; i < nrows; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];
if (i < nrows - no_prefetch_size) {
PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
}
for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = 2*index[j];
const size_t idx_gh = 2*rid[i];
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
BuildHistKernel<FPType, true>(gpair, span1, gmat, isDense, hist);
// no prefetching to avoid loading extra memory
BuildHistKernel<FPType, false>(gpair, span2, gmat, isDense, hist);
}
}