Improve multi-threaded performance (#2104)
* Add UpdatePredictionCache() option to updaters Some updaters (e.g. fast_hist) has enough information to quickly compute prediction cache for the training data. Each updater may override UpdaterPredictionCache() method to update the prediction cache. Note: this trick does not apply to validation data. * Respond to code review * Disable some debug messages by default * Document UpdatePredictionCache() interface * Remove base_margin logic from UpdatePredictionCache() implementation * Do not take pointer to cfg, as reference may get stale * Improve multi-threaded performance * Use columnwise accessor to accelerate ApplySplit() step, with support for a compressed representation * Parallel sort for evaluation step * Inline BuildHist() function * Cache gradient pairs when building histograms in BuildHist() * Add missing #if macro * Respond to code review * Use wrapper to enable parallel sort on Linux * Fix C++ compatibility issues * MSVC doesn't support unsigned in OpenMP loops * gcc 4.6 doesn't support using keyword * Fix lint issues * Respond to code review * Fix bug in ApplySplitSparseData() * Attempting to read beyond the end of a sparse column * Mishandling the case where an entire range of rows have missing values * Fix training continuation bug Disable UpdatePredictionCache() in the first iteration. This way, we can accomodate the scenario where we build off of an existing (nonempty) ensemble. * Add regression test for fast_hist * Respond to code review * Add back old version of ApplySplitSparseData
This commit is contained in:
231
src/common/column_matrix.h
Normal file
231
src/common/column_matrix.h
Normal file
@@ -0,0 +1,231 @@
|
||||
/*!
|
||||
* Copyright 2017 by Contributors
|
||||
* \file column_matrix.h
|
||||
* \brief Utility for fast column-wise access
|
||||
* \author Philip Cho
|
||||
*/
|
||||
|
||||
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
#define XGBOOST_TYPE_SWITCH(dtype, OP) \
|
||||
switch (dtype) { \
|
||||
case xgboost::common::uint32 : { \
|
||||
typedef uint32_t DType; \
|
||||
OP; break; \
|
||||
} \
|
||||
case xgboost::common::uint16 : { \
|
||||
typedef uint16_t DType; \
|
||||
OP; break; \
|
||||
} \
|
||||
case xgboost::common::uint8 : { \
|
||||
typedef uint8_t DType; \
|
||||
OP; break; \
|
||||
default: LOG(FATAL) << "don't recognize type flag" << dtype; \
|
||||
} \
|
||||
}
|
||||
|
||||
#include <type_traits>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include "hist_util.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
/*! \brief indicator of data type used for storing bin id's in a column. */
|
||||
enum DataType {
|
||||
uint8 = 1,
|
||||
uint16 = 2,
|
||||
uint32 = 4
|
||||
};
|
||||
|
||||
/*! \brief column type */
|
||||
enum ColumnType {
|
||||
kDenseColumn,
|
||||
kSparseColumn
|
||||
};
|
||||
|
||||
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
||||
bin id is stored as index[i] + index_base. */
|
||||
template<typename T>
|
||||
class Column {
|
||||
public:
|
||||
ColumnType type;
|
||||
const T* index;
|
||||
uint32_t index_base;
|
||||
const uint32_t* row_ind;
|
||||
size_t len;
|
||||
};
|
||||
|
||||
/*! \brief a collection of columns, with support for construction from
|
||||
GHistIndexMatrix. */
|
||||
class ColumnMatrix {
|
||||
public:
|
||||
// get number of features
|
||||
inline uint32_t GetNumFeature() const {
|
||||
return type_.size();
|
||||
}
|
||||
|
||||
// construct column matrix from GHistIndexMatrix
|
||||
inline void Init(const GHistIndexMatrix& gmat, DataType dtype) {
|
||||
this->dtype = dtype;
|
||||
/* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
|
||||
slot of internal buffer. */
|
||||
packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
|
||||
|
||||
const uint32_t nfeature = gmat.cut->row_ptr.size() - 1;
|
||||
const omp_ulong nrow = static_cast<omp_ulong>(gmat.row_ptr.size() - 1);
|
||||
|
||||
// identify type of each column
|
||||
feature_counts_.resize(nfeature);
|
||||
type_.resize(nfeature);
|
||||
std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
|
||||
|
||||
uint32_t max_val = 0;
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
max_val = static_cast<uint32_t>(std::numeric_limits<DType>::max());
|
||||
});
|
||||
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
||||
CHECK_LE(gmat.cut->row_ptr[fid + 1] - gmat.cut->row_ptr[fid], max_val);
|
||||
}
|
||||
|
||||
gmat.GetFeatureCounts(&feature_counts_[0]);
|
||||
// classify features
|
||||
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
||||
if (static_cast<double>(feature_counts_[fid]) < 0.5*nrow) {
|
||||
type_[fid] = kSparseColumn;
|
||||
} else {
|
||||
type_[fid] = kDenseColumn;
|
||||
}
|
||||
}
|
||||
|
||||
// want to compute storage boundary for each feature
|
||||
// using variants of prefix sum scan
|
||||
boundary_.resize(nfeature);
|
||||
bst_uint accum_index_ = 0;
|
||||
bst_uint accum_row_ind_ = 0;
|
||||
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
||||
boundary_[fid].index_begin = accum_index_;
|
||||
boundary_[fid].row_ind_begin = accum_row_ind_;
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
accum_index_ += nrow;
|
||||
} else {
|
||||
accum_index_ += feature_counts_[fid];
|
||||
accum_row_ind_ += feature_counts_[fid];
|
||||
}
|
||||
boundary_[fid].index_end = accum_index_;
|
||||
boundary_[fid].row_ind_end = accum_row_ind_;
|
||||
}
|
||||
|
||||
index_.resize((boundary_[nfeature - 1].index_end
|
||||
+ (packing_factor_ - 1)) / packing_factor_);
|
||||
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
|
||||
|
||||
// store least bin id for each feature
|
||||
index_base_.resize(nfeature);
|
||||
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
||||
index_base_[fid] = gmat.cut->row_ptr[fid];
|
||||
}
|
||||
|
||||
// fill index_ for dense columns
|
||||
for (uint32_t fid = 0; fid < nfeature; ++fid) {
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
const uint32_t ibegin = boundary_[fid].index_begin;
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
const size_t block_offset = ibegin / packing_factor_;
|
||||
const size_t elem_offset = ibegin % packing_factor_;
|
||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
||||
DType* end = begin + nrow;
|
||||
std::fill(begin, end, std::numeric_limits<DType>::max());
|
||||
// max() indicates missing values
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// loop over all rows and fill column entries
|
||||
// num_nonzeros[fid] = how many nonzeros have this feature accumulated so far?
|
||||
std::vector<uint32_t> num_nonzeros;
|
||||
num_nonzeros.resize(nfeature);
|
||||
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
|
||||
for (uint32_t rid = 0; rid < nrow; ++rid) {
|
||||
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
|
||||
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
|
||||
size_t fid = 0;
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
const size_t bin_id = gmat.index[i];
|
||||
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
|
||||
++fid;
|
||||
}
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
||||
begin[rid] = bin_id - index_base_[fid];
|
||||
});
|
||||
} else {
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
||||
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
|
||||
});
|
||||
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
|
||||
++num_nonzeros[fid];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
|
||||
to determine type of bin id's */
|
||||
template<typename T>
|
||||
inline Column<T> GetColumn(unsigned fid) const {
|
||||
const bool valid_type = std::is_same<T, uint32_t>::value
|
||||
|| std::is_same<T, uint16_t>::value
|
||||
|| std::is_same<T, uint8_t>::value;
|
||||
CHECK(valid_type);
|
||||
|
||||
Column<T> c;
|
||||
|
||||
c.type = type_[fid];
|
||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
||||
c.index = reinterpret_cast<const T*>(&index_[block_offset]) + elem_offset;
|
||||
c.index_base = index_base_[fid];
|
||||
c.row_ind = &row_ind_[boundary_[fid].row_ind_begin];
|
||||
c.len = boundary_[fid].index_end - boundary_[fid].index_begin;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
public:
|
||||
DataType dtype;
|
||||
|
||||
private:
|
||||
struct ColumnBoundary {
|
||||
// indicate where each column's index and row_ind is stored.
|
||||
// index_begin and index_end are logical offsets, so they should be converted to
|
||||
// actual offsets by scaling with packing_factor_
|
||||
unsigned index_begin;
|
||||
unsigned index_end;
|
||||
unsigned row_ind_begin;
|
||||
unsigned row_ind_end;
|
||||
};
|
||||
|
||||
std::vector<bst_uint> feature_counts_;
|
||||
std::vector<ColumnType> type_;
|
||||
std::vector<uint32_t> index_; // index_: may store smaller integers; needs padding
|
||||
std::vector<uint32_t> row_ind_;
|
||||
std::vector<ColumnBoundary> boundary_;
|
||||
|
||||
size_t packing_factor_; // how many integers are stored in each slot of index_
|
||||
|
||||
// index_base_[fid]: least bin id for feature fid
|
||||
std::vector<uint32_t> index_base_;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <vector>
|
||||
#include "./sync.h"
|
||||
#include "./hist_util.h"
|
||||
#include "./column_matrix.h"
|
||||
#include "./quantile.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -21,12 +22,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
|
||||
const int kFactor = 8;
|
||||
std::vector<WXQSketch> sketchs;
|
||||
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
nthread = std::max(nthread / 2, 1);
|
||||
const int nthread = omp_get_max_threads();
|
||||
|
||||
unsigned nstep = (info.num_col + nthread - 1) / nthread;
|
||||
unsigned ncol = static_cast<unsigned>(info.num_col);
|
||||
@@ -105,18 +101,14 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
CHECK(cut != nullptr);
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
hit_count.resize(cut->row_ptr.back(), 0);
|
||||
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
nthread = std::max(nthread / 2, 1);
|
||||
const int nthread = omp_get_max_threads();
|
||||
const unsigned nbins = cut->row_ptr.back();
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(nthread * nbins, 0);
|
||||
|
||||
iter->BeforeFirst();
|
||||
row_ptr.push_back(0);
|
||||
@@ -134,6 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
omp_ulong bsize = static_cast<omp_ulong>(batch.size);
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
|
||||
const int tid = omp_get_thread_num();
|
||||
size_t ibegin = row_ptr[rbegin + i];
|
||||
size_t iend = row_ptr[rbegin + i + 1];
|
||||
RowBatch::Inst inst = batch[i];
|
||||
@@ -147,20 +140,28 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
if (it == cend) it = cend - 1;
|
||||
unsigned idx = static_cast<unsigned>(it - cut->cut.begin());
|
||||
index[ibegin + j] = idx;
|
||||
++hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
std::sort(index.begin() + ibegin, index.begin() + iend);
|
||||
}
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (omp_ulong idx = 0; idx < nbins; ++idx) {
|
||||
for (int tid = 0; tid < nthread; ++tid) {
|
||||
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
CHECK(!data_.empty()) << "GHistBuilder must be initialized";
|
||||
CHECK_EQ(data_.size(), nbins_ * nthread_) << "invalid dimensions for temp buffer";
|
||||
|
||||
data_.resize(nbins_ * nthread_, GHistEntry());
|
||||
std::fill(data_.begin(), data_.end(), GHistEntry());
|
||||
stat_buf_.resize(row_indices.size());
|
||||
|
||||
const int K = 8; // loop unrolling factor
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
@@ -169,21 +170,42 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
const bst_omp_uint tid = omp_get_thread_num();
|
||||
const size_t off = tid * nbins_;
|
||||
bst_uint rid[K];
|
||||
bst_gpair stat[K];
|
||||
size_t ibegin[K], iend[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
stat_buf_[i + k] = stat[k];
|
||||
}
|
||||
}
|
||||
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
|
||||
const bst_uint rid = row_indices.begin[i];
|
||||
const bst_gpair stat = gpair[rid];
|
||||
stat_buf_[i] = stat;
|
||||
}
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
const bst_omp_uint tid = omp_get_thread_num();
|
||||
const size_t off = tid * nbins_;
|
||||
bst_uint rid[K];
|
||||
size_t ibegin[K];
|
||||
size_t iend[K];
|
||||
bst_gpair stat[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
|
||||
iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
stat[k] = stat_buf_[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const size_t bin = gmat.index[j];
|
||||
@@ -193,9 +215,9 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
|
||||
const bst_uint rid = row_indices.begin[i];
|
||||
const bst_gpair stat = gpair[rid];
|
||||
const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
|
||||
const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
|
||||
const bst_gpair stat = stat_buf_[i];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const size_t bin = gmat.index[j];
|
||||
data_[bin].Add(stat);
|
||||
@@ -212,13 +234,26 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
}
|
||||
|
||||
void GHistBuilder::SubtractionTrick(GHistRow self,
|
||||
GHistRow sibling,
|
||||
GHistRow parent) {
|
||||
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
|
||||
const int K = 8;
|
||||
const bst_omp_uint rest = nbins % K;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
|
||||
for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
|
||||
GHistEntry pb[K];
|
||||
GHistEntry sb[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
pb[k] = parent.begin[bin_id + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
sb[k] = sibling.begin[bin_id + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
|
||||
}
|
||||
}
|
||||
for (bst_omp_uint bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
|
||||
self.begin[bin_id].SetSubtract(parent.begin[bin_id], sibling.begin[bin_id]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,18 +102,27 @@ struct GHistIndexMatrix {
|
||||
std::vector<unsigned> index;
|
||||
/*! \brief hit count of each index */
|
||||
std::vector<unsigned> hit_count;
|
||||
/*! \brief optional remap index from outter row_id -> internal row_id*/
|
||||
std::vector<unsigned> remap_index;
|
||||
/*! \brief The corresponding cuts */
|
||||
const HistCutMatrix* cut;
|
||||
// Create a global histogram matrix, given cut
|
||||
void Init(DMatrix* p_fmat);
|
||||
// build remap
|
||||
void Remap();
|
||||
// get i-th row
|
||||
inline GHistIndexRow operator[](bst_uint i) const {
|
||||
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
|
||||
}
|
||||
inline void GetFeatureCounts(bst_uint* counts) const {
|
||||
const unsigned nfeature = cut->row_ptr.size() - 1;
|
||||
for (unsigned fid = 0; fid < nfeature; ++fid) {
|
||||
const unsigned ibegin = cut->row_ptr[fid];
|
||||
const unsigned iend = cut->row_ptr[fid + 1];
|
||||
for (unsigned i = ibegin; i < iend; ++i) {
|
||||
counts[fid] += hit_count[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<unsigned> hit_count_tloc_;
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -189,13 +198,13 @@ class GHistBuilder {
|
||||
inline void Init(size_t nthread, size_t nbins) {
|
||||
nthread_ = nthread;
|
||||
nbins_ = nbins;
|
||||
data_.resize(nthread * nbins, GHistEntry());
|
||||
}
|
||||
|
||||
// construct a histogram via histogram aggregation
|
||||
void BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist);
|
||||
// construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
|
||||
@@ -206,6 +215,7 @@ class GHistBuilder {
|
||||
/*! \brief number of all bins over all features */
|
||||
size_t nbins_;
|
||||
std::vector<GHistEntry> data_;
|
||||
std::vector<bst_gpair> stat_buf_;
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -17,15 +17,20 @@ namespace common {
|
||||
/*! \brief collection of rowset */
|
||||
class RowSetCollection {
|
||||
public:
|
||||
/*! \brief subset of rows */
|
||||
/*! \brief data structure to store an instance set, a subset of
|
||||
* rows (instances) associated with a particular node in a decision
|
||||
* tree. */
|
||||
struct Elem {
|
||||
const bst_uint* begin;
|
||||
const bst_uint* end;
|
||||
int node_id;
|
||||
// id of node associated with this instance set; -1 means uninitialized
|
||||
Elem(void)
|
||||
: begin(nullptr), end(nullptr) {}
|
||||
: begin(nullptr), end(nullptr), node_id(-1) {}
|
||||
Elem(const bst_uint* begin,
|
||||
const bst_uint* end)
|
||||
: begin(begin), end(end) {}
|
||||
const bst_uint* end,
|
||||
int node_id)
|
||||
: begin(begin), end(end), node_id(node_id) {}
|
||||
|
||||
inline size_t size() const {
|
||||
return end - begin;
|
||||
@@ -36,6 +41,15 @@ class RowSetCollection {
|
||||
std::vector<bst_uint> left;
|
||||
std::vector<bst_uint> right;
|
||||
};
|
||||
|
||||
inline std::vector<Elem>::const_iterator begin() const {
|
||||
return elem_of_each_node_.begin();
|
||||
}
|
||||
|
||||
inline std::vector<Elem>::const_iterator end() const {
|
||||
return elem_of_each_node_.end();
|
||||
}
|
||||
|
||||
/*! \brief return corresponding element set given the node_id */
|
||||
inline const Elem& operator[](unsigned node_id) const {
|
||||
const Elem& e = elem_of_each_node_[node_id];
|
||||
@@ -53,7 +67,7 @@ class RowSetCollection {
|
||||
CHECK_EQ(elem_of_each_node_.size(), 0U);
|
||||
const bst_uint* begin = dmlc::BeginPtr(row_indices_);
|
||||
const bst_uint* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
|
||||
elem_of_each_node_.emplace_back(Elem(begin, end));
|
||||
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
|
||||
}
|
||||
// split rowset into two
|
||||
inline void AddSplit(unsigned node_id,
|
||||
@@ -79,15 +93,15 @@ class RowSetCollection {
|
||||
}
|
||||
|
||||
if (left_node_id >= elem_of_each_node_.size()) {
|
||||
elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr));
|
||||
elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
|
||||
}
|
||||
if (right_node_id >= elem_of_each_node_.size()) {
|
||||
elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr));
|
||||
elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
|
||||
}
|
||||
|
||||
elem_of_each_node_[left_node_id] = Elem(begin, split_pt);
|
||||
elem_of_each_node_[right_node_id] = Elem(split_pt, e.end);
|
||||
elem_of_each_node_[node_id] = Elem(nullptr, nullptr);
|
||||
elem_of_each_node_[left_node_id] = Elem(begin, split_pt, left_node_id);
|
||||
elem_of_each_node_[right_node_id] = Elem(split_pt, e.end, right_node_id);
|
||||
elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
|
||||
}
|
||||
|
||||
// stores the row indices in the set
|
||||
|
||||
Reference in New Issue
Block a user