diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index fe172a7e9..d79084f92 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2020 by Contributors + * Copyright 2017-2021 by Contributors * \file updater_quantile_hist.cc * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Checn, Egor Smirnov @@ -7,15 +7,15 @@ #include #include -#include -#include -#include #include -#include +#include #include +#include #include +#include #include #include +#include #include "xgboost/logging.h" #include "xgboost/tree_updater.h" @@ -111,32 +111,24 @@ void QuantileHistMaker::Update(HostDeviceVector *gpair, bool QuantileHistMaker::UpdatePredictionCache( const DMatrix* data, HostDeviceVector* out_preds) { - if (param_.subsample < 1.0f) { - return false; + if (hist_maker_param_.single_precision_histogram && float_builder_) { + return float_builder_->UpdatePredictionCache(data, out_preds); + } else if (double_builder_) { + return double_builder_->UpdatePredictionCache(data, out_preds); } else { - if (hist_maker_param_.single_precision_histogram && float_builder_) { - return float_builder_->UpdatePredictionCache(data, out_preds); - } else if (double_builder_) { - return double_builder_->UpdatePredictionCache(data, out_preds); - } else { - return false; - } + return false; } } bool QuantileHistMaker::UpdatePredictionCacheMulticlass( const DMatrix* data, HostDeviceVector* out_preds, const int gid, const int ngroup) { - if (param_.subsample < 1.0f) { - return false; + if (hist_maker_param_.single_precision_histogram && float_builder_) { + return float_builder_->UpdatePredictionCache(data, out_preds, gid, ngroup); + } else if (double_builder_) { + return double_builder_->UpdatePredictionCache(data, out_preds, gid, ngroup); } else { - if (hist_maker_param_.single_precision_histogram && float_builder_) { - return float_builder_->UpdatePredictionCache(data, out_preds, gid, ngroup); - } else if (double_builder_) { - return double_builder_->UpdatePredictionCache(data, out_preds, gid, ngroup); - } else { - return false; - } + return false; } } @@ -615,6 +607,7 @@ void QuantileHistMaker::Builder::Update( tree_evaluator_ = TreeEvaluator(param_, p_fmat->Info().num_col_, GenericParameter::kCpuId); interaction_constraints_.Reset(); + p_last_fmat_mutable_ = p_fmat; this->InitData(gmat, gpair_h, *p_fmat, *p_tree); if (param_.grow_policy == TrainParam::kLossGuide) { @@ -639,18 +632,14 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( HostDeviceVector* p_out_preds, const int gid, const int ngroup) { // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in // conjunction with Update(). - if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) { + if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_ || + p_last_fmat_ != p_last_fmat_mutable_) { return false; } builder_monitor_.Start("UpdatePredictionCache"); std::vector& out_preds = p_out_preds->HostVector(); - if (leaf_value_cache_.empty()) { - leaf_value_cache_.resize(p_last_tree_->param.num_nodes, - std::numeric_limits::infinity()); - } - CHECK_GT(out_preds.size(), 0U); size_t n_nodes = row_set_collection_.end() - row_set_collection_.begin(); @@ -680,30 +669,60 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( } }); + if (param_.subsample < 1.0f) { + // Making a real prediction for the remaining rows + size_t fvecs_size = feat_vecs_.size(); + feat_vecs_.resize(omp_get_max_threads(), RegTree::FVec()); + while (fvecs_size < feat_vecs_.size()) { + feat_vecs_[fvecs_size++].Init(data->Info().num_col_); + } + for (auto&& batch : p_last_fmat_mutable_->GetBatches()) { + HostSparsePageView page_view = batch.GetView(); + const auto num_parallel_ops = static_cast(unused_rows_.size()); + common::ParallelFor(num_parallel_ops, [&](bst_omp_uint block_id) { + RegTree::FVec &feats = feat_vecs_[omp_get_thread_num()]; + const SparsePage::Inst inst = page_view[unused_rows_[block_id]]; + feats.Fill(inst); + + const size_t row_num = unused_rows_[block_id] + batch.base_rowid; + const int lid = feats.HasMissing() ? p_last_tree_->GetLeafIndex(feats) : + p_last_tree_->GetLeafIndex(feats); + out_preds[row_num * ngroup + gid] += (*p_last_tree_)[lid].LeafValue(); + + feats.Drop(inst); + }); + } + } builder_monitor_.Stop("UpdatePredictionCache"); return true; } + template void QuantileHistMaker::Builder::InitSampling(const std::vector& gpair, const DMatrix& fmat, std::vector* row_indices) { const auto& info = fmat.Info(); auto& rnd = common::GlobalRandom(); - std::vector& row_indices_local = *row_indices; - size_t* p_row_indices = row_indices_local.data(); + unused_rows_.resize(info.num_row_); + size_t* p_row_indices_used = row_indices->data(); + size_t* p_row_indices_unused = unused_rows_.data(); #if XGBOOST_CUSTOMIZE_GLOBAL_PRNG std::bernoulli_distribution coin_flip(param_.subsample); - size_t j = 0; + size_t used = 0, unused = 0; for (size_t i = 0; i < info.num_row_; ++i) { if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) { - p_row_indices[j++] = i; + p_row_indices_used[used++] = i; + } else { + p_row_indices_unused[unused++] = i; } } /* resize row_indices to reduce memory */ - row_indices_local.resize(j); + row_indices->resize(used); + unused_rows_.resize(unused); #else const size_t nthread = this->nthread_; - std::vector row_offsets(nthread, 0); + std::vector row_offsets_used(nthread, 0); + std::vector row_offsets_unused(nthread, 0); /* usage of mt19937_64 give 2x speed up for subsampling */ std::vector rnds(nthread); /* create engine for each thread */ @@ -725,27 +744,51 @@ void QuantileHistMaker::Builder::InitSampling(const std::vector= 0.0f && rnds[tid]() < coin_flip_border) { - p_row_indices[ibegin + row_offsets[tid]++] = i; + p_row_indices_used[ibegin + row_offsets_used[tid]++] = i; + } else { + p_row_indices_unused[ibegin + row_offsets_unused[tid]++] = i; } } + + #pragma omp barrier + + if (tid == 0ul) { + size_t prefix_sum_used = row_offsets_used[0]; + for (size_t i = 1; i < nthread; ++i) { + const size_t ibegin = i * discard_size; + + for (size_t k = 0; k < row_offsets_used[i]; ++k) { + p_row_indices_used[prefix_sum_used + k] = p_row_indices_used[ibegin + k]; + } + + prefix_sum_used += row_offsets_used[i]; + } + /* resize row_indices to reduce memory */ + row_indices->resize(prefix_sum_used); + } + + if (nthread == 1ul || tid == 1ul) { + size_t prefix_sum_unused = row_offsets_unused[0]; + for (size_t i = 1; i < nthread; ++i) { + const size_t ibegin = i * discard_size; + + for (size_t k = 0; k < row_offsets_unused[i]; ++k) { + p_row_indices_unused[prefix_sum_unused + k] = p_row_indices_unused[ibegin + k]; + } + + prefix_sum_unused += row_offsets_unused[i]; + } + /* resize row_indices to reduce memory */ + unused_rows_.resize(prefix_sum_unused); + } }); } exc.Rethrow(); /* discard global engine */ rnd = rnds[nthread - 1]; - size_t prefix_sum = row_offsets[0]; - for (size_t i = 1; i < nthread; ++i) { - const size_t ibegin = i * discard_size; - - for (size_t k = 0; k < row_offsets[i]; ++k) { - row_indices_local[prefix_sum + k] = row_indices_local[ibegin + k]; - } - prefix_sum += row_offsets[i]; - } - /* resize row_indices to reduce memory */ - row_indices_local.resize(prefix_sum); #endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG } + template void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, const std::vector& gpair, @@ -764,8 +807,6 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& { // initialize the row set row_set_collection_.Clear(); - // clear local prediction cache - leaf_value_cache_.clear(); // initialize histogram collection uint32_t nbins = gmat.cut.Ptrs().back(); hist_.Init(nbins); @@ -793,6 +834,9 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& << "Only uniform sampling is supported, " << "gradient-based sampling is only support by GPU Hist."; InitSampling(gpair, fmat, &row_indices); + // We should check that the partitioning was done correctly + // and each row of the dataset fell into exactly one of the categories + CHECK_EQ(row_indices.size() + unused_rows_.size(), info.num_row_); } else { MemStackAllocator buff(this->nthread_); bool* p_buff = buff.Get(); diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index d408b16b0..36019beae 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2018 by Contributors + * Copyright 2017-2021 by Contributors * \file updater_quantile_hist.h * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Chen, Egor Smirnov @@ -11,13 +11,12 @@ #include #include -#include -#include -#include -#include #include -#include +#include +#include +#include #include +#include #include "xgboost/data.h" #include "xgboost/json.h" @@ -409,6 +408,10 @@ class QuantileHistMaker: public TreeUpdater { common::ColumnSampler column_sampler_; // the internal row sets RowSetCollection row_set_collection_; + // tree rows that were not used for current training + std::vector unused_rows_; + // feature vectors for subsampled prediction + std::vector feat_vecs_; // the temp space for split std::vector row_split_tloc_; std::vector best_split_tloc_; @@ -422,8 +425,6 @@ class QuantileHistMaker: public TreeUpdater { /*! \brief feature with least # of bins. to be used for dense specialization of InitNewNode() */ uint32_t fid_least_bins_; - /*! \brief local prediction cache; maps node id to leaf value */ - std::vector leaf_value_cache_; GHistBuilder hist_builder_; std::unique_ptr pruner_; @@ -435,6 +436,7 @@ class QuantileHistMaker: public TreeUpdater { // back pointers to tree and data matrix const RegTree* p_last_tree_; DMatrix const* const p_last_fmat_; + DMatrix* p_last_fmat_mutable_; using ExpandQueue = std::priority_queue, diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 9a62d9ba3..ab1454bae 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -183,7 +183,7 @@ TEST(CpuPredictor, InplacePredict) { } } -TEST(CpuPredictor, UpdatePredictionCache) { +void TestUpdatePredictionCache(bool use_subsampling) { size_t constexpr kRows = 64, kCols = 16, kClasses = 4; LearnerModelParam mparam; mparam.num_feature = kCols; @@ -198,6 +198,9 @@ TEST(CpuPredictor, UpdatePredictionCache) { std::map cfg; cfg["tree_method"] = "hist"; cfg["predictor"] = "cpu_predictor"; + if (use_subsampling) { + cfg["subsample"] = "0.5"; + } Args args = {cfg.cbegin(), cfg.cend()}; gbm->Configure(args); @@ -226,6 +229,11 @@ TEST(CpuPredictor, UpdatePredictionCache) { } } +TEST(CpuPredictor, UpdatePredictionCache) { + TestUpdatePredictionCache(false); + TestUpdatePredictionCache(true); +} + TEST(CpuPredictor, LesserFeatures) { TestPredictionWithLesserFeatures("cpu_predictor"); } diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 9167e5747..069952a88 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -1,5 +1,5 @@ /*! - * Copyright 2018-2019 by Contributors + * Copyright 2018-2021 by Contributors */ #include #include @@ -107,19 +107,45 @@ class QuantileHistMock : public QuantileHistMaker { const size_t nthreads = omp_get_num_threads(); // save state of global rng engine auto initial_rnd = common::GlobalRandom(); + std::vector unused_rows_cpy = this->unused_rows_; RealImpl::InitData(gmat, gpair, *p_fmat, tree); std::vector row_indices_initial = *(this->row_set_collection_.Data()); + std::vector unused_row_indices_initial = this->unused_rows_; + auto check_each_row_occurs_in_one_of_arrays = [](const std::vector& first, + const std::vector& second, + size_t nrows) { + std::vector arr_union(nrows); + for (auto&& row_indice : first) { + ++arr_union[row_indice]; + } + for (auto&& row_indice : second) { + ++arr_union[row_indice]; + } + for (auto&& row_cnt : arr_union) { + ASSERT_EQ(row_cnt, 1ul); + } + }; + check_each_row_occurs_in_one_of_arrays(row_indices_initial, unused_row_indices_initial, + p_fmat->Info().num_row_); for (size_t i_nthreads = 1; i_nthreads < 4; ++i_nthreads) { omp_set_num_threads(i_nthreads); // return initial state of global rng engine common::GlobalRandom() = initial_rnd; + this->unused_rows_ = unused_rows_cpy; RealImpl::InitData(gmat, gpair, *p_fmat, tree); std::vector& row_indices = *(this->row_set_collection_.Data()); ASSERT_EQ(row_indices_initial.size(), row_indices.size()); for (size_t i = 0; i < row_indices_initial.size(); ++i) { ASSERT_EQ(row_indices_initial[i], row_indices[i]); } + std::vector& unused_row_indices = this->unused_rows_; + ASSERT_EQ(unused_row_indices_initial.size(), unused_row_indices.size()); + for (size_t i = 0; i < unused_row_indices_initial.size(); ++i) { + ASSERT_EQ(unused_row_indices_initial[i], unused_row_indices[i]); + } + check_each_row_occurs_in_one_of_arrays(row_indices, unused_row_indices, + p_fmat->Info().num_row_); } omp_set_num_threads(nthreads); }