diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 3057dcdc9..09955302d 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -451,8 +451,6 @@ class DMatrix {
   // the following are column meta data, should be able to answer them fast.
   /*! \return Whether the data columns single column block. */
   virtual bool SingleColBlock() const = 0;
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) = 0;
   /*! \brief virtual destructor */
   virtual ~DMatrix() = default;
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 345f20fd4..86974e09c 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -16,21 +16,6 @@ MetaInfo& SimpleDMatrix::Info() { return info; }
 
 const MetaInfo& SimpleDMatrix::Info() const { return info; }
 
-float SimpleDMatrix::GetColDensity(size_t cidx) {
-  size_t column_size = 0;
-  // Use whatever version of column batches already exists
-  if (sorted_column_page_) {
-    auto batch = this->GetBatches<SortedCSCPage>();
-    column_size = (*batch.begin())[cidx].size();
-  } else {
-    auto batch = this->GetBatches<CSCPage>();
-    column_size = (*batch.begin())[cidx].size();
-  }
-
-  size_t nmiss = this->Info().num_row_ - column_size;
-  return 1.0f - (static_cast<float>(nmiss)) / this->Info().num_row_;
-}
-
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
   auto begin_iter = BatchIterator<SparsePage>(
@@ -76,8 +61,6 @@ BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(const BatchParam& param)
   return BatchSet<EllpackPage>(begin_iter);
 }
 
-bool SimpleDMatrix::SingleColBlock() const { return true; }
-
 template <typename AdapterT>
 SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   // Set number of threads but keep old value so we can reset it after
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
index 2fdd53036..f7768b01f 100644
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -30,9 +30,7 @@ class SimpleDMatrix : public DMatrix {
 
   const MetaInfo& Info() const override;
 
-  float GetColDensity(size_t cidx) override;
-
-  bool SingleColBlock() const override;
+  bool SingleColBlock() const override { return true; }
 
   /*! \brief magic number used to identify SimpleDMatrix binary files */
   static const int kMagic = 0xffffab01;
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index be0730ceb..27abd8c31 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -58,28 +58,6 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
   return BatchSet<EllpackPage>(begin_iter);
 }
 
-float SparsePageDMatrix::GetColDensity(size_t cidx) {
-  // Finds densities if we don't already have them
-  if (col_density_.empty()) {
-    std::vector<size_t> column_size(this->Info().num_col_);
-    for (const auto &batch : this->GetBatches<CSCPage>()) {
-      for (auto i = 0u; i < batch.Size(); i++) {
-        column_size[i] += batch[i].size();
-      }
-    }
-    col_density_.resize(column_size.size());
-    for (auto i = 0u; i < col_density_.size(); i++) {
-      size_t nmiss = this->Info().num_row_ - column_size[i];
-      col_density_[i] =
-          1.0f - (static_cast<float>(nmiss)) / this->Info().num_row_;
-    }
-  }
-  return col_density_.at(cidx);
-}
-
-bool SparsePageDMatrix::SingleColBlock() const {
-  return false;
-}
 }  // namespace data
 }  // namespace xgboost
 #endif  // DMLC_ENABLE_STD_THREAD
diff --git a/src/data/sparse_page_dmatrix.h
b/src/data/sparse_page_dmatrix.h
index 46a48a8be..1116d1c88 100644
--- a/src/data/sparse_page_dmatrix.h
+++ b/src/data/sparse_page_dmatrix.h
@@ -37,9 +37,7 @@ class SparsePageDMatrix : public DMatrix {
 
   const MetaInfo& Info() const override;
 
-  float GetColDensity(size_t cidx) override;
-
-  bool SingleColBlock() const override;
+  bool SingleColBlock() const override { return false; }
 
  private:
   BatchSet<SparsePage> GetRowBatches() override;
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index d5287fb70..edf207a9c 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -61,7 +61,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     CHECK(p_fmat->SingleColBlock());
     SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
 
-    if ( IsEmpty() ) { return; }
+    if (IsEmpty()) {
+      return;
+    }
+
     dh::safe_cuda(cudaSetDevice(learner_param_->gpu_id));
     // The begin and end indices for the section of each column associated with
     // this device
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 5b0f859c3..965922136 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -77,6 +77,24 @@ class ColMaker: public TreeUpdater {
     return "grow_colmaker";
   }
 
+  void LazyGetColumnDensity(DMatrix *dmat) {
+    // Finds densities if we don't already have them
+    if (column_densities_.empty()) {
+      std::vector<size_t> column_size(dmat->Info().num_col_);
+      for (const auto &batch : dmat->GetBatches<SortedCSCPage>()) {
+        for (auto i = 0u; i < batch.Size(); i++) {
+          column_size[i] += batch[i].size();
+        }
+      }
+      column_densities_.resize(column_size.size());
+      for (auto i = 0u; i < column_densities_.size(); i++) {
+        size_t nmiss = dmat->Info().num_row_ - column_size[i];
+        column_densities_[i] =
+            1.0f - (static_cast<float>(nmiss)) / dmat->Info().num_row_;
+      }
+    }
+  }
+
   void Update(HostDeviceVector<GradientPair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
@@ -84,6 +102,7 @@ class ColMaker: public TreeUpdater
{
       LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't "
                     "support distributed training.";
     }
+    this->LazyGetColumnDensity(dmat);
     // rescale learning rate according to size of trees
     float lr = param_.learning_rate;
     param_.learning_rate = lr / trees.size();
@@ -94,7 +113,7 @@ class ColMaker: public TreeUpdater {
           param_, colmaker_param_,
           std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()),
-          interaction_constraints_);
+          interaction_constraints_, column_densities_);
       builder.Update(gpair->ConstHostVector(), dmat, tree);
     }
     param_.learning_rate = lr;
@@ -106,6 +125,7 @@ class ColMaker: public TreeUpdater {
   ColMakerTrainParam colmaker_param_;
   // SplitEvaluator that will be cloned for each Builder
   std::unique_ptr<SplitEvaluator> spliteval_;
+  std::vector<float> column_densities_;
   FeatureInteractionConstraintHost interaction_constraints_;
   // data structure
@@ -139,11 +159,13 @@ class ColMaker: public TreeUpdater {
     explicit Builder(const TrainParam& param,
                      const ColMakerTrainParam& colmaker_train_param,
                      std::unique_ptr<SplitEvaluator> spliteval,
-                     FeatureInteractionConstraintHost _interaction_constraints)
+                     FeatureInteractionConstraintHost _interaction_constraints,
+                     const std::vector<float> &column_densities)
         : param_(param), colmaker_train_param_{colmaker_train_param},
           nthread_(omp_get_max_threads()),
           spliteval_(std::move(spliteval)),
-          interaction_constraints_{std::move(_interaction_constraints)} {}
+          interaction_constraints_{std::move(_interaction_constraints)},
+          column_densities_(column_densities) {}
     // update one tree, growing
     virtual void Update(const std::vector<GradientPair>& gpair,
                        DMatrix* p_fmat,
@@ -433,22 +455,14 @@ class ColMaker: public TreeUpdater {
 #endif  // defined(_OPENMP)
       {
-        std::vector<float> densities(num_features);
-        CHECK_EQ(feat_set.size(), num_features);
-        for (bst_omp_uint i = 0; i < num_features; ++i) {
-          bst_feature_t const fid = feat_set[i];
-          densities.at(i) = p_fmat->GetColDensity(fid);
-        }
-
 #pragma omp parallel for schedule(dynamic, batch_size)
         for (bst_omp_uint i = 0; i < num_features; ++i) {
          bst_feature_t const fid = feat_set[i];
          int32_t const tid = omp_get_thread_num();
          auto c = batch[fid];
          const bool ind =
              c.size() != 0 && c[0].fvalue == c[c.size() - 1].fvalue;
-         auto const density = densities[i];
          if (colmaker_train_param_.NeedForwardSearch(
-                param_.default_direction, density, ind)) {
+                param_.default_direction, column_densities_[fid], ind)) {
            this->EnumerateSplit(c.data(), c.data() + c.size(), +1, fid, gpair,
                                 stemp_[tid]);
          }
@@ -598,6 +612,7 @@ class ColMaker: public TreeUpdater {
     std::unique_ptr<SplitEvaluator> spliteval_;
 
     FeatureInteractionConstraintHost interaction_constraints_;
+    const std::vector<float> &column_densities_;
   };
 };
@@ -620,11 +635,12 @@ class DistColMaker : public ColMaker {
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     CHECK_EQ(trees.size(), 1U) << "DistColMaker: only support one tree at a time";
+    this->LazyGetColumnDensity(dmat);
     Builder builder(
         param_, colmaker_param_,
         std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()),
-        interaction_constraints_);
+        interaction_constraints_, column_densities_);
     // build the tree
     builder.Update(gpair->ConstHostVector(), dmat, trees[0]);
     //// prune the tree, note that pruner will sync the tree
@@ -637,12 +653,14 @@ class DistColMaker : public ColMaker {
   class Builder : public ColMaker::Builder {
    public:
     explicit Builder(const TrainParam &param,
-                     ColMakerTrainParam const& colmaker_train_param,
+                     ColMakerTrainParam const &colmaker_train_param,
                      std::unique_ptr<SplitEvaluator> spliteval,
-                     FeatureInteractionConstraintHost _interaction_constraints)
+                     FeatureInteractionConstraintHost _interaction_constraints,
+                     const std::vector<float> &column_densities)
         : ColMaker::Builder(param, colmaker_train_param,
                             std::move(spliteval),
-                            std::move(_interaction_constraints)) {}
+                            std::move(_interaction_constraints),
+                            column_densities) {}
     inline void UpdatePosition(DMatrix* p_fmat, const RegTree &tree) {
       const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
 #pragma omp parallel for schedule(static)
diff --git a/tests/cpp/data/test_simple_dmatrix.cc
b/tests/cpp/data/test_simple_dmatrix.cc
index 79f505684..b5295bd3a 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -51,9 +51,6 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
   CreateSimpleTestData(tmp_file);
   xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, true, false);
 
-  // Sorted column access
-  EXPECT_EQ(dmat->GetColDensity(0), 1);
-  EXPECT_EQ(dmat->GetColDensity(1), 0.5);
   ASSERT_TRUE(dmat->SingleColBlock());
 
   // Loop over the batches and assert the data is as expected
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index f1356eddb..356b2fba4 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -47,9 +47,6 @@ TEST(SparsePageDMatrix, ColAccess) {
   xgboost::DMatrix *dmat =
       xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
 
-  EXPECT_EQ(dmat->GetColDensity(0), 1);
-  EXPECT_EQ(dmat->GetColDensity(1), 0.5);
-
   // Loop over the batches and assert the data is as expected
   for (auto const &col_batch : dmat->GetBatches<SortedCSCPage>()) {
     EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 111a75028..5d4c12e2d 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -96,8 +96,9 @@ TEST(Learner, SLOW_CheckMultiBatch) {
   // Create sufficiently large data to make two row pages
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/big.libsvm";
-  CreateBigTestData(tmp_file, 5000000);
-  std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load( tmp_file + "#" + tmp_file + ".cache", true, false));
+  CreateBigTestData(tmp_file, 50000);
+  std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(
+      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 100));
   EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
   EXPECT_FALSE(dmat->SingleColBlock());
   size_t num_row = dmat->Info().num_row_;