xgboost/src/common/hist_util.h
Philip Cho 14fba01b5a Improve multi-threaded performance (#2104)
* Add UpdatePredictionCache() option to updaters

Some updaters (e.g. fast_hist) has enough information to quickly compute
prediction cache for the training data. Each updater may override
UpdaterPredictionCache() method to update the prediction cache. Note: this
trick does not apply to validation data.

* Respond to code review

* Disable some debug messages by default
* Document UpdatePredictionCache() interface
* Remove base_margin logic from UpdatePredictionCache() implementation
* Do not take pointer to cfg, as reference may get stale

* Improve multi-threaded performance

* Use columnwise accessor to accelerate ApplySplit() step,
  with support for a compressed representation
* Parallel sort for evaluation step
* Inline BuildHist() function
* Cache gradient pairs when building histograms in BuildHist()

* Add missing #if macro

* Respond to code review

* Use wrapper to enable parallel sort on Linux

* Fix C++ compatibility issues

* MSVC doesn't support unsigned in OpenMP loops
* gcc 4.6 doesn't support using keyword

* Fix lint issues

* Respond to code review

* Fix bug in ApplySplitSparseData()

* Attempting to read beyond the end of a sparse column
* Mishandling the case where an entire range of rows have missing values

* Fix training continuation bug

Disable UpdatePredictionCache() in the first iteration. This way, we can
accomodate the scenario where we build off of an existing (nonempty) ensemble.

* Add regression test for fast_hist

* Respond to code review

* Add back old version of ApplySplitSparseData
2017-03-25 10:35:01 -07:00

225 lines
6.3 KiB
C++

/*!
* Copyright 2017 by Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_COMMON_HIST_UTIL_H_
#define XGBOOST_COMMON_HIST_UTIL_H_
#include <xgboost/data.h>
#include <limits>
#include <vector>
#include "row_set.h"
namespace xgboost {
namespace common {
/*! \brief sums of gradient statistics corresponding to a histogram bin */
struct GHistEntry {
/*! \brief sum of first-order gradient statistics */
double sum_grad;
/*! \brief sum of second-order gradient statistics */
double sum_hess;
GHistEntry() : sum_grad(0), sum_hess(0) {}
/*! \brief add a bst_gpair to the sum */
inline void Add(const bst_gpair& e) {
sum_grad += e.grad;
sum_hess += e.hess;
}
/*! \brief add a GHistEntry to the sum */
inline void Add(const GHistEntry& e) {
sum_grad += e.sum_grad;
sum_hess += e.sum_hess;
}
/*! \brief set sum to be difference of two GHistEntry's */
inline void SetSubtract(const GHistEntry& a, const GHistEntry& b) {
sum_grad = a.sum_grad - b.sum_grad;
sum_hess = a.sum_hess - b.sum_hess;
}
};
/*! \brief Cut configuration for one feature */
struct HistCutUnit {
/*! \brief the index pointer of each histunit */
const bst_float* cut;
/*! \brief number of cutting point, containing the maximum point */
size_t size;
// default constructor
HistCutUnit() {}
// constructor
HistCutUnit(const bst_float* cut, unsigned size)
: cut(cut), size(size) {}
};
/*! \brief cut configuration for all the features */
struct HistCutMatrix {
/*! \brief actual unit pointer */
std::vector<unsigned> row_ptr;
/*! \brief minimum value of each feature */
std::vector<bst_float> min_val;
/*! \brief the cut field */
std::vector<bst_float> cut;
/*! \brief Get histogram bound for fid */
inline HistCutUnit operator[](unsigned fid) const {
return HistCutUnit(dmlc::BeginPtr(cut) + row_ptr[fid],
row_ptr[fid + 1] - row_ptr[fid]);
}
// create histogram cut matrix given statistics from data
// using approximate quantile sketch approach
void Init(DMatrix* p_fmat, size_t max_num_bins);
};
/*!
* \brief A single row in global histogram index.
* Directly represent the global index in the histogram entry.
*/
struct GHistIndexRow {
/*! \brief The index of the histogram */
const unsigned* index;
/*! \brief The size of the histogram */
unsigned size;
GHistIndexRow() {}
GHistIndexRow(const unsigned* index, unsigned size)
: index(index), size(size) {}
};
/*!
* \brief preprocessed global index matrix, in CSR format
* Transform floating values to integer index in histogram
* This is a global histogram index.
*/
struct GHistIndexMatrix {
/*! \brief row pointer */
std::vector<unsigned> row_ptr;
/*! \brief The index data */
std::vector<unsigned> index;
/*! \brief hit count of each index */
std::vector<unsigned> hit_count;
/*! \brief The corresponding cuts */
const HistCutMatrix* cut;
// Create a global histogram matrix, given cut
void Init(DMatrix* p_fmat);
// get i-th row
inline GHistIndexRow operator[](bst_uint i) const {
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
}
inline void GetFeatureCounts(bst_uint* counts) const {
const unsigned nfeature = cut->row_ptr.size() - 1;
for (unsigned fid = 0; fid < nfeature; ++fid) {
const unsigned ibegin = cut->row_ptr[fid];
const unsigned iend = cut->row_ptr[fid + 1];
for (unsigned i = ibegin; i < iend; ++i) {
counts[fid] += hit_count[i];
}
}
}
private:
std::vector<unsigned> hit_count_tloc_;
};
/*!
* \brief histogram of graident statistics for a single node.
* Consists of multiple GHistEntry's, each entry showing total graident statistics
* for that particular bin
* Uses global bin id so as to represent all features simultaneously
*/
struct GHistRow {
/*! \brief base pointer to first entry */
GHistEntry* begin;
/*! \brief number of entries */
unsigned size;
GHistRow() {}
GHistRow(GHistEntry* begin, unsigned size)
: begin(begin), size(size) {}
};
/*!
* \brief histogram of gradient statistics for multiple nodes
*/
class HistCollection {
public:
// access histogram for i-th node
inline GHistRow operator[](bst_uint nid) const {
const size_t kMax = std::numeric_limits<size_t>::max();
CHECK_NE(row_ptr_[nid], kMax);
return GHistRow(const_cast<GHistEntry*>(dmlc::BeginPtr(data_) + row_ptr_[nid]), nbins_);
}
// have we computed a histogram for i-th node?
inline bool RowExists(bst_uint nid) const {
const size_t kMax = std::numeric_limits<size_t>::max();
return (nid < row_ptr_.size() && row_ptr_[nid] != kMax);
}
// initialize histogram collection
inline void Init(size_t nbins) {
nbins_ = nbins;
row_ptr_.clear();
data_.clear();
}
// create an empty histogram for i-th node
inline void AddHistRow(bst_uint nid) {
const size_t kMax = std::numeric_limits<size_t>::max();
if (nid >= row_ptr_.size()) {
row_ptr_.resize(nid + 1, kMax);
}
CHECK_EQ(row_ptr_[nid], kMax);
row_ptr_[nid] = data_.size();
data_.resize(data_.size() + nbins_);
}
private:
/*! \brief number of all bins over all features */
size_t nbins_;
std::vector<GHistEntry> data_;
/*! \brief row_ptr_[nid] locates bin for historgram of node nid */
std::vector<size_t> row_ptr_;
};
/*!
* \brief builder for histograms of gradient statistics
*/
class GHistBuilder {
public:
// initialize builder
inline void Init(size_t nthread, size_t nbins) {
nthread_ = nthread;
nbins_ = nbins;
}
// construct a histogram via histogram aggregation
void BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const std::vector<bst_uint>& feat_set,
GHistRow hist);
// construct a histogram via subtraction trick
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
private:
/*! \brief number of threads for parallel computation */
size_t nthread_;
/*! \brief number of all bins over all features */
size_t nbins_;
std::vector<GHistEntry> data_;
std::vector<bst_gpair> stat_buf_;
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_HIST_UTIL_H_