Optimizations of pre-processing for 'hist' tree method (#4310)

* optimizations for pre-processing

* code cleaning

* code cleaning

* code cleaning after review

* Apply suggestions from code review

Co-Authored-By: SmirnovEgorRu <egor.smirnov@intel.com>
This commit is contained in:
Egor Smirnov
2019-04-17 03:36:19 +03:00
committed by Philip Hyunsu Cho
parent 207f058711
commit 711397d645
6 changed files with 299 additions and 48 deletions

View File

@@ -71,7 +71,7 @@ class ColumnMatrix {
// construct column matrix from GHistIndexMatrix
inline void Init(const GHistIndexMatrix& gmat,
double sparse_threshold) {
const auto nfeature = static_cast<bst_uint>(gmat.cut.row_ptr.size() - 1);
const int32_t nfeature = static_cast<int32_t>(gmat.cut.row_ptr.size() - 1);
const size_t nrow = gmat.row_ptr.size() - 1;
// identify type of each column
@@ -86,7 +86,7 @@ class ColumnMatrix {
gmat.GetFeatureCounts(&feature_counts_[0]);
// classify features
for (bst_uint fid = 0; fid < nfeature; ++fid) {
for (int32_t fid = 0; fid < nfeature; ++fid) {
if (static_cast<double>(feature_counts_[fid])
< sparse_threshold * nrow) {
type_[fid] = kSparseColumn;
@@ -100,7 +100,7 @@ class ColumnMatrix {
boundary_.resize(nfeature);
size_t accum_index_ = 0;
size_t accum_row_ind_ = 0;
for (bst_uint fid = 0; fid < nfeature; ++fid) {
for (int32_t fid = 0; fid < nfeature; ++fid) {
boundary_[fid].index_begin = accum_index_;
boundary_[fid].row_ind_begin = accum_row_ind_;
if (type_[fid] == kDenseColumn) {
@@ -124,7 +124,9 @@ class ColumnMatrix {
}
// pre-fill index_ for dense columns
for (bst_uint fid = 0; fid < nfeature; ++fid) {
#pragma omp parallel for
for (int32_t fid = 0; fid < nfeature; ++fid) {
if (type_[fid] == kDenseColumn) {
const size_t ibegin = boundary_[fid].index_begin;
uint32_t* begin = &index_[ibegin];
@@ -184,8 +186,8 @@ class ColumnMatrix {
std::vector<size_t> feature_counts_;
std::vector<ColumnType> type_;
std::vector<uint32_t> index_; // index_: may store smaller integers; needs padding
std::vector<size_t> row_ind_;
SimpleArray<uint32_t> index_; // index_: may store smaller integers; needs padding
SimpleArray<size_t> row_ind_;
std::vector<ColumnBoundary> boundary_;
// index_base_[fid]: least bin id for feature fid