* Add UpdatePredictionCache() option to updaters. Some updaters (e.g. fast_hist) have enough information to quickly compute the prediction cache for the training data. Each updater may override the UpdatePredictionCache() method to update the prediction cache. Note: this trick does not apply to validation data. * Respond to code review * Disable some debug messages by default * Document UpdatePredictionCache() interface * Remove base_margin logic from UpdatePredictionCache() implementation * Do not take pointer to cfg, as reference may get stale * Improve multi-threaded performance * Use columnwise accessor to accelerate ApplySplit() step, with support for a compressed representation * Parallel sort for evaluation step * Inline BuildHist() function * Cache gradient pairs when building histograms in BuildHist() * Add missing #if macro * Respond to code review * Use wrapper to enable parallel sort on Linux * Fix C++ compatibility issues * MSVC doesn't support unsigned in OpenMP loops * gcc 4.6 doesn't support using keyword * Fix lint issues * Respond to code review * Fix bug in ApplySplitSparseData() * Attempting to read beyond the end of a sparse column * Mishandling the case where an entire range of rows have missing values * Fix training continuation bug. Disable UpdatePredictionCache() in the first iteration. This way, we can accommodate the scenario where we build off of an existing (nonempty) ensemble. * Add regression test for fast_hist * Respond to code review * Add back old version of ApplySplitSparseData
232 lines
7.6 KiB
C++
232 lines
7.6 KiB
C++
/*!
|
|
* Copyright 2017 by Contributors
|
|
* \file column_matrix.h
|
|
* \brief Utility for fast column-wise access
|
|
* \author Philip Cho
|
|
*/
|
|
|
|
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
|
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
|
|
|
/*!
 * \brief Dispatch on a run-time bin-id type flag.
 *
 *  Expands to a switch statement over dtype. Each recognized flag
 *  (xgboost::common::uint32, uint16 or uint8) typedefs DType to the matching
 *  fixed-width unsigned integer and then executes OP with that typedef in
 *  scope. Any other value is reported through LOG(FATAL).
 *
 *  OP is a macro argument, so every comma inside it must be enclosed in
 *  parentheses; a top-level comma would be taken as an argument separator.
 *
 * \param dtype expression evaluating to the type flag
 * \param OP statement (usually a braced block) that may refer to DType
 */
#define XGBOOST_TYPE_SWITCH(dtype, OP)                                  \
switch (dtype) {                                                        \
  case xgboost::common::uint32 : {                                      \
    typedef uint32_t DType;                                             \
    OP; break;                                                          \
  }                                                                     \
  case xgboost::common::uint16 : {                                      \
    typedef uint16_t DType;                                             \
    OP; break;                                                          \
  }                                                                     \
  case xgboost::common::uint8 : {                                       \
    typedef uint8_t DType;                                              \
    OP; break;                                                          \
  }                                                                     \
  default: LOG(FATAL) << "don't recognize type flag" << dtype;          \
}
|
|
|
|
#include <type_traits>
|
|
#include <limits>
|
|
#include <vector>
|
|
#include "hist_util.h"
|
|
|
|
namespace xgboost {
|
|
namespace common {
|
|
|
|
/*!
 * \brief indicator of data type used for storing bin id's in a column.
 *        The numeric value of each flag is the size in bytes of the
 *        integer type it stands for.
 */
enum DataType {
  uint8 = sizeof(uint8_t),    // one byte per bin id
  uint16 = sizeof(uint16_t),  // two bytes per bin id
  uint32 = sizeof(uint32_t)   // four bytes per bin id
};
|
|
|
|
/*!
 * \brief storage layout of a column.
 *        A dense column keeps one bin-id slot per row; a sparse column keeps
 *        only the entries that are present, together with their row indices.
 */
enum ColumnType {
  kDenseColumn = 0,
  kSparseColumn = 1
};
|
|
|
|
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
|
bin id is stored as index[i] + index_base. */
|
|
template<typename T>
|
|
class Column {
|
|
public:
|
|
ColumnType type;
|
|
const T* index;
|
|
uint32_t index_base;
|
|
const uint32_t* row_ind;
|
|
size_t len;
|
|
};
|
|
|
|
/*! \brief a collection of columns, with support for construction from
|
|
GHistIndexMatrix. */
|
|
class ColumnMatrix {
|
|
public:
|
|
// get number of features
|
|
inline uint32_t GetNumFeature() const {
|
|
return type_.size();
|
|
}
|
|
|
|
// construct column matrix from GHistIndexMatrix
|
|
inline void Init(const GHistIndexMatrix& gmat, DataType dtype) {
|
|
this->dtype = dtype;
|
|
/* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
|
|
slot of internal buffer. */
|
|
packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
|
|
|
|
const uint32_t nfeature = gmat.cut->row_ptr.size() - 1;
|
|
const omp_ulong nrow = static_cast<omp_ulong>(gmat.row_ptr.size() - 1);
|
|
|
|
// identify type of each column
|
|
feature_counts_.resize(nfeature);
|
|
type_.resize(nfeature);
|
|
std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
|
|
|
|
uint32_t max_val = 0;
|
|
XGBOOST_TYPE_SWITCH(this->dtype, {
|
|
max_val = static_cast<uint32_t>(std::numeric_limits<DType>::max());
|
|
});
|
|
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
|
CHECK_LE(gmat.cut->row_ptr[fid + 1] - gmat.cut->row_ptr[fid], max_val);
|
|
}
|
|
|
|
gmat.GetFeatureCounts(&feature_counts_[0]);
|
|
// classify features
|
|
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
|
if (static_cast<double>(feature_counts_[fid]) < 0.5*nrow) {
|
|
type_[fid] = kSparseColumn;
|
|
} else {
|
|
type_[fid] = kDenseColumn;
|
|
}
|
|
}
|
|
|
|
// want to compute storage boundary for each feature
|
|
// using variants of prefix sum scan
|
|
boundary_.resize(nfeature);
|
|
bst_uint accum_index_ = 0;
|
|
bst_uint accum_row_ind_ = 0;
|
|
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
|
boundary_[fid].index_begin = accum_index_;
|
|
boundary_[fid].row_ind_begin = accum_row_ind_;
|
|
if (type_[fid] == kDenseColumn) {
|
|
accum_index_ += nrow;
|
|
} else {
|
|
accum_index_ += feature_counts_[fid];
|
|
accum_row_ind_ += feature_counts_[fid];
|
|
}
|
|
boundary_[fid].index_end = accum_index_;
|
|
boundary_[fid].row_ind_end = accum_row_ind_;
|
|
}
|
|
|
|
index_.resize((boundary_[nfeature - 1].index_end
|
|
+ (packing_factor_ - 1)) / packing_factor_);
|
|
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
|
|
|
|
// store least bin id for each feature
|
|
index_base_.resize(nfeature);
|
|
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
|
index_base_[fid] = gmat.cut->row_ptr[fid];
|
|
}
|
|
|
|
// fill index_ for dense columns
|
|
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
|
if (type_[fid] == kDenseColumn) {
|
|
const uint32_t ibegin = boundary_[fid].index_begin;
|
|
XGBOOST_TYPE_SWITCH(this->dtype, {
|
|
const size_t block_offset = ibegin / packing_factor_;
|
|
const size_t elem_offset = ibegin % packing_factor_;
|
|
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
|
DType* end = begin + nrow;
|
|
std::fill(begin, end, std::numeric_limits<DType>::max());
|
|
// max() indicates missing values
|
|
});
|
|
}
|
|
}
|
|
|
|
// loop over all rows and fill column entries
|
|
// num_nonzeros[fid] = how many nonzeros have this feature accumulated so far?
|
|
std::vector<uint32_t> num_nonzeros;
|
|
num_nonzeros.resize(nfeature);
|
|
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
|
|
for (uint32_t rid = 0; rid < nrow; ++rid) {
|
|
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
|
|
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
|
|
size_t fid = 0;
|
|
for (size_t i = ibegin; i < iend; ++i) {
|
|
const size_t bin_id = gmat.index[i];
|
|
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
|
|
++fid;
|
|
}
|
|
if (type_[fid] == kDenseColumn) {
|
|
XGBOOST_TYPE_SWITCH(this->dtype, {
|
|
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
|
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
|
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
|
begin[rid] = bin_id - index_base_[fid];
|
|
});
|
|
} else {
|
|
XGBOOST_TYPE_SWITCH(this->dtype, {
|
|
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
|
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
|
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
|
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
|
|
});
|
|
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
|
|
++num_nonzeros[fid];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
|
|
to determine type of bin id's */
|
|
template<typename T>
|
|
inline Column<T> GetColumn(unsigned fid) const {
|
|
const bool valid_type = std::is_same<T, uint32_t>::value
|
|
|| std::is_same<T, uint16_t>::value
|
|
|| std::is_same<T, uint8_t>::value;
|
|
CHECK(valid_type);
|
|
|
|
Column<T> c;
|
|
|
|
c.type = type_[fid];
|
|
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
|
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
|
c.index = reinterpret_cast<const T*>(&index_[block_offset]) + elem_offset;
|
|
c.index_base = index_base_[fid];
|
|
c.row_ind = &row_ind_[boundary_[fid].row_ind_begin];
|
|
c.len = boundary_[fid].index_end - boundary_[fid].index_begin;
|
|
|
|
return c;
|
|
}
|
|
|
|
public:
|
|
DataType dtype;
|
|
|
|
private:
|
|
struct ColumnBoundary {
|
|
// indicate where each column's index and row_ind is stored.
|
|
// index_begin and index_end are logical offsets, so they should be converted to
|
|
// actual offsets by scaling with packing_factor_
|
|
unsigned index_begin;
|
|
unsigned index_end;
|
|
unsigned row_ind_begin;
|
|
unsigned row_ind_end;
|
|
};
|
|
|
|
std::vector<bst_uint> feature_counts_;
|
|
std::vector<ColumnType> type_;
|
|
std::vector<uint32_t> index_; // index_: may store smaller integers; needs padding
|
|
std::vector<uint32_t> row_ind_;
|
|
std::vector<ColumnBoundary> boundary_;
|
|
|
|
size_t packing_factor_; // how many integers are stored in each slot of index_
|
|
|
|
// index_base_[fid]: least bin id for feature fid
|
|
std::vector<uint32_t> index_base_;
|
|
};
|
|
|
|
} // namespace common
|
|
} // namespace xgboost
|
|
#endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
|