From 754fe8142b295c95798b2cfb2031aa0875b69f19 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sat, 16 Feb 2019 04:39:41 +0800
Subject: [PATCH] Make `HistCutMatrix::Init` aware of groups. (#4115)

* Add checks for group size.
* Simple docs.
* Search group index during hist cut matrix initialization.

Co-authored-by: Jiaming Yuan
Co-authored-by: Philip Hyunsu Cho
---
 doc/python/python_intro.rst        |  4 ++
 src/common/hist_util.cc            | 57 +++++++++++++++++++++-----
 src/common/hist_util.h             |  9 +++++
 src/learner.cc                     | 24 ++++++++++-
 tests/cpp/common/test_hist_util.cc | 51 +++++++++++++++++++++++
 tests/cpp/test_learner.cc          | 65 ++++++++++++++++++++++++------
 6 files changed, 188 insertions(+), 22 deletions(-)
 create mode 100644 tests/cpp/common/test_hist_util.cc

diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
index 4255cba4e..0c15f9fd5 100644
--- a/doc/python/python_intro.rst
+++ b/doc/python/python_intro.rst
@@ -101,6 +101,10 @@ The data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object.
   w = np.random.rand(5, 1)
   dtrain = xgb.DMatrix(data, label=label, missing=-999.0, weight=w)
 
+When performing ranking tasks, the number of weights should be equal
+to the number of groups.
+
+
 Setting Parameters
 ------------------
 XGBoost can use either a list of pairs or a dictionary to set :doc:`parameters </parameter>`. For instance:
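To make the documentation note above concrete, here is a minimal, self-contained
sketch (illustration only, not part of the patch; the weight values are made up)
of how one weight per group maps onto rows. It uses the same group sizes,
{2, 3, 7, 5}, as the C++ tests added later in this patch:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<unsigned> group_sizes{2, 3, 7, 5};
      std::vector<float> weights{0.1f, 0.4f, 0.2f, 0.3f};  // one weight per group (made up)

      // Build the cumulative group pointer, mirroring MetaInfo::group_ptr_:
      // {0, 2, 5, 12, 17}.  Entry g is the first row of group g; the last
      // entry is the total number of rows.
      std::vector<unsigned> group_ptr{0};
      for (unsigned size : group_sizes) {
        group_ptr.push_back(group_ptr.back() + size);
      }

      // Every row in group g shares weights[g].
      for (std::size_t g = 0; g + 1 < group_ptr.size(); ++g) {
        for (unsigned row = group_ptr[g]; row < group_ptr[g + 1]; ++row) {
          std::cout << "row " << row << " -> weight " << weights[g] << "\n";
        }
      }
      return 0;
    }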
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 422265a15..1646bd031 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -24,7 +24,25 @@
 namespace xgboost {
 namespace common {
 
+HistCutMatrix::HistCutMatrix() {
+  monitor_.Init("HistCutMatrix");
+}
+
+size_t HistCutMatrix::SearchGroupIndFromBaseRow(
+    std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const {
+  using KIt = std::vector<bst_uint>::const_iterator;
+  KIt res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
+  // Cannot use CHECK_NE because it will try to print the iterator.
+  bool const found = res != group_ptr.cend() - 1;
+  if (!found) {
+    LOG(FATAL) << "Row " << base_rowid << " does not lie in any group!\n";
+  }
+  size_t group_ind = std::distance(group_ptr.cbegin(), res);
+  return group_ind;
+}
+
 void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
+  monitor_.Start("Init");
   const MetaInfo& info = p_fmat->Info();
 
   // safe factor for better accuracy
@@ -33,30 +51,50 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
   const int nthread = omp_get_max_threads();
 
-  auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
-  auto ncol = static_cast<unsigned>(info.num_col_);
+  unsigned const nstep =
+      static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
+  unsigned const ncol = static_cast<unsigned>(info.num_col_);
   sketchs.resize(info.num_col_);
   for (auto& s : sketchs) {
     s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
  }
 
   const auto& weights = info.weights_.HostVector();
+
+  // Data groups, used in ranking.
+  std::vector<bst_uint> const& group_ptr = info.group_ptr_;
+  size_t const num_groups = group_ptr.size() == 0 ? 0 : group_ptr.size() - 1;
+  // Whether to look up weights by group index instead of row index.
+  bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;
+
   for (const auto &batch : p_fmat->GetRowBatches()) {
-    #pragma omp parallel num_threads(nthread)
+    size_t group_ind = 0;
+    if (use_group_ind) {
+      group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
+    }
+#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
     {
       CHECK_EQ(nthread, omp_get_num_threads());
       auto tid = static_cast<unsigned>(omp_get_thread_num());
       unsigned begin = std::min(nstep * tid, ncol);
       unsigned end = std::min(nstep * (tid + 1), ncol);
+
+      // do not iterate if no columns are assigned to the thread
      if (begin < end && end <= ncol) {
         for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
-          size_t ridx = batch.base_rowid + i;
-          SparsePage::Inst inst = batch[i];
-          for (auto& ins : inst) {
-            if (ins.index >= begin && ins.index < end) {
-              sketchs[ins.index].Push(ins.fvalue,
-                                      weights.size() > 0 ? weights[ridx] : 1.0f);
+          size_t const ridx = batch.base_rowid + i;
+          SparsePage::Inst const inst = batch[i];
+          if (use_group_ind &&
+              group_ptr[group_ind] == ridx &&
+              // the maximum group index equals weights.size() - 1
+              group_ind < num_groups - 1) {
+            // move to the next group
+            group_ind++;
+          }
+          for (auto const& entry : inst) {
+            if (entry.index >= begin && entry.index < end) {
+              size_t w_idx = use_group_ind ? group_ind : ridx;
+              sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
             }
           }
         }
@@ -65,6 +103,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
   }
 
   Init(&sketchs, max_num_bins);
+  monitor_.Stop("Init");
 }
 
 void HistCutMatrix::Init
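The key piece above is the `std::lower_bound` search over the cumulative group
pointer. A standalone sketch of the same logic follows (illustration only: plain
standard-library types instead of xgboost's `bst_uint`, an `assert` standing in
for `LOG(FATAL)`, and the helper name `SearchGroupInd` is mine, not the patch's):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <iterator>
    #include <vector>

    // Find the index of the first group whose starting row is not less than
    // base_rowid. In the patch, base_rowid comes from a row batch and, in the
    // tested cases, coincides with a group boundary.
    std::size_t SearchGroupInd(std::vector<unsigned> const& group_ptr,
                               std::size_t base_rowid) {
      // The last entry of group_ptr is the total number of rows, not the
      // start of a group, so the search excludes it; reaching it means the
      // row lies past every group.
      auto res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1,
                                  base_rowid);
      assert(res != group_ptr.cend() - 1 && "row does not lie in any group");
      return static_cast<std::size_t>(std::distance(group_ptr.cbegin(), res));
    }

    int main() {
      // Groups of sizes {2, 3, 7, 5}, as in the tests added by this patch.
      std::vector<unsigned> group_ptr{0, 2, 5, 12, 17};
      assert(SearchGroupInd(group_ptr, 0) == 0);  // row 0 starts group 0
      assert(SearchGroupInd(group_ptr, 5) == 2);  // row 5 starts group 2
      // SearchGroupInd(group_ptr, 17) would trip the assert: past the end.
      return 0;
    }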
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 64bf3ec0d..6c3be21b0 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -13,6 +13,7 @@
 #include "row_set.h"
 #include "../tree/param.h"
 #include "./quantile.h"
+#include "./timer.h"
 #include "../include/rabit/rabit.h"
 
 namespace xgboost {
@@ -35,6 +36,14 @@ struct HistCutMatrix {
   void Init(DMatrix* p_fmat, uint32_t max_num_bins);
 
   void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);
+
+  HistCutMatrix();
+
+ protected:
+  virtual size_t SearchGroupIndFromBaseRow(
+      std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const;
+
+  Monitor monitor_;
 };
 
 /*! \brief Builds the cut matrix on the GPU */
diff --git a/src/learner.cc b/src/learner.cc
index 0f015479a..4d0435608 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -474,12 +474,16 @@ class LearnerImpl : public Learner {
   void UpdateOneIter(int iter, DMatrix* train) override {
     monitor_.Start("UpdateOneIter");
+
+    // TODO(trivialfis): Merge the duplicated code with BoostOneIter
     CHECK(ModelInitialized())
         << "Always call InitModel or LoadModel before update";
     if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
       common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
     }
+    this->ValidateDMatrix(train);
     this->PerformTreeMethodHeuristic(train);
+
     monitor_.Start("PredictRaw");
     this->PredictRaw(train, &preds_);
     monitor_.Stop("PredictRaw");
@@ -493,10 +497,15 @@ class LearnerImpl : public Learner {
   void BoostOneIter(int iter, DMatrix* train,
                     HostDeviceVector<GradientPair>* in_gpair) override {
     monitor_.Start("BoostOneIter");
+
+    CHECK(ModelInitialized())
+        << "Always call InitModel or LoadModel before boost.";
     if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
       common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
     }
+    this->ValidateDMatrix(train);
     this->PerformTreeMethodHeuristic(train);
+
     gbm_->DoBoost(train, in_gpair);
     monitor_.Stop("BoostOneIter");
   }
@@ -711,7 +720,7 @@ class LearnerImpl : public Learner {
       mparam_.num_feature = num_feature;
     }
     CHECK_NE(mparam_.num_feature, 0)
-        << "0 feature is supplied. Are you using raw Booster?";
+        << "0 feature is supplied. Are you using raw Booster interface?";
     // setup
     cfg_["num_feature"] = common::ToString(mparam_.num_feature);
     CHECK(obj_ == nullptr && gbm_ == nullptr);
@@ -736,6 +745,19 @@ class LearnerImpl : public Learner {
     gbm_->PredictBatch(data, out_preds, ntree_limit);
   }
 
+  void ValidateDMatrix(DMatrix* p_fmat) {
+    MetaInfo const& info = p_fmat->Info();
+    auto const& weights = info.weights_.HostVector();
+    if (info.group_ptr_.size() != 0 && weights.size() != 0) {
+      CHECK(weights.size() == info.group_ptr_.size() - 1)
+          << "\n"
+          << "weights size: " << weights.size() << ", "
+          << "groups size: " << info.group_ptr_.size() - 1 << ", "
+          << "num rows: " << p_fmat->Info().num_row_ << "\n"
+          << "The number of weights should be equal to the number of groups in a ranking task.";
+    }
+  }
+
   // model parameter
   LearnerModelParam mparam_;
   // training parameter
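The invariant enforced by the new `ValidateDMatrix` is small enough to state in
a few lines. A standalone sketch (the helper name `GroupWeightsConsistent` is
mine, not the patch's):

    #include <cassert>
    #include <vector>

    // The invariant behind ValidateDMatrix: group_ptr stores cumulative group
    // boundaries, so it has one more entry than there are groups. Per-group
    // weights must therefore satisfy weights.size() == group_ptr.size() - 1.
    bool GroupWeightsConsistent(std::vector<unsigned> const& group_ptr,
                                std::vector<float> const& weights) {
      if (group_ptr.empty() || weights.empty()) {
        return true;  // nothing to validate, mirroring the guard in the patch
      }
      return weights.size() == group_ptr.size() - 1;
    }

    int main() {
      std::vector<unsigned> group_ptr{0, 2, 5, 12, 17};  // 4 groups, 17 rows
      assert(GroupWeightsConsistent(group_ptr, {1.f, 1.f, 1.f, 1.f}));  // one per group: OK
      assert(!GroupWeightsConsistent(group_ptr,
                                     std::vector<float>(17, 1.f)));     // one per row: rejected
      return 0;
    }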
diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc
new file mode 100644
index 000000000..d959f486d
--- /dev/null
+++ b/tests/cpp/common/test_hist_util.cc
@@ -0,0 +1,51 @@
+#include <gtest/gtest.h>
+#include <xgboost/data.h>
+#include <vector>
+#include <cstddef>
+
+#include "../../../src/common/hist_util.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace common {
+
+class HistCutMatrixMock : public HistCutMatrix {
+ public:
+  size_t SearchGroupIndFromBaseRow(
+      std::vector<bst_uint> const& group_ptr, size_t const base_rowid) {
+    return HistCutMatrix::SearchGroupIndFromBaseRow(group_ptr, base_rowid);
+  }
+};
+
+TEST(HistCutMatrix, SearchGroupInd) {
+  size_t constexpr kNumGroups = 4;
+  size_t constexpr kNumRows = 17;
+  size_t constexpr kNumCols = 15;
+
+  auto pp_mat = CreateDMatrix(kNumRows, kNumCols, 0);
+
+  auto& p_mat = *pp_mat;
+  std::vector<bst_uint> group(kNumGroups);
+  group[0] = 2;
+  group[1] = 3;
+  group[2] = 7;
+  group[3] = 5;
+
+  p_mat->Info().SetInfo(
+      "group", group.data(), DataType::kUInt32, kNumGroups);
+
+  HistCutMatrixMock hmat;
+
+  size_t group_ind = hmat.SearchGroupIndFromBaseRow(p_mat->Info().group_ptr_, 0);
+  ASSERT_EQ(group_ind, 0);
+
+  group_ind = hmat.SearchGroupIndFromBaseRow(p_mat->Info().group_ptr_, 5);
+  ASSERT_EQ(group_ind, 2);
+
+  EXPECT_ANY_THROW(hmat.SearchGroupIndFromBaseRow(p_mat->Info().group_ptr_, 17));
+
+  delete pp_mat;
+}
+
+}  // namespace common
+}  // namespace xgboost
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 46e68b9ec..e1c16f296 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -6,9 +6,9 @@
 
 namespace xgboost {
 
-TEST(learner, Test) {
-  typedef std::pair<std::string, std::string> arg;
-  auto args = {arg("tree_method", "exact")};
+TEST(Learner, Basic) {
+  typedef std::pair<std::string, std::string> Arg;
+  auto args = {Arg("tree_method", "exact")};
   auto mat_ptr = CreateDMatrix(10, 10, 0);
   std::vector<std::shared_ptr<DMatrix>> mat = {*mat_ptr};
   auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
@@ -17,33 +17,33 @@
   delete mat_ptr;
 }
 
-TEST(learner, SelectTreeMethod) {
-  using arg = std::pair<std::string, std::string>;
+TEST(Learner, SelectTreeMethod) {
+  using Arg = std::pair<std::string, std::string>;
   auto mat_ptr = CreateDMatrix(10, 10, 0);
   std::vector<std::shared_ptr<DMatrix>> mat = {*mat_ptr};
   auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
   // Test if `tree_method` can be set
-  learner->Configure({arg("tree_method", "approx")});
+  learner->Configure({Arg("tree_method", "approx")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_histmaker,prune");
-  learner->Configure({arg("tree_method", "exact")});
+  learner->Configure({Arg("tree_method", "exact")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_colmaker,prune");
-  learner->Configure({arg("tree_method", "hist")});
+  learner->Configure({Arg("tree_method", "hist")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_quantile_histmaker");
-  learner->Configure({arg{"booster", "dart"}, arg{"tree_method", "hist"}});
+  learner->Configure({Arg{"booster", "dart"}, Arg{"tree_method", "hist"}});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_quantile_histmaker");
 #ifdef XGBOOST_USE_CUDA
-  learner->Configure({arg("tree_method", "gpu_exact")});
+  learner->Configure({Arg("tree_method", "gpu_exact")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_gpu,prune");
-  learner->Configure({arg("tree_method", "gpu_hist")});
+  learner->Configure({Arg("tree_method", "gpu_hist")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_gpu_hist");
-  learner->Configure({arg{"booster", "dart"}, arg{"tree_method", "gpu_hist"}});
+  learner->Configure({Arg{"booster", "dart"}, Arg{"tree_method", "gpu_hist"}});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
             "grow_gpu_hist");
 #endif
@@ -51,4 +51,45 @@
   delete mat_ptr;
 }
 
+TEST(Learner, CheckGroup) {
+  using Arg = std::pair<std::string, std::string>;
+  size_t constexpr kNumGroups = 4;
+  size_t constexpr kNumRows = 17;
+  size_t constexpr kNumCols = 15;
+
+  auto pp_mat = CreateDMatrix(kNumRows, kNumCols, 0);
+  auto& p_mat = *pp_mat;
+  std::vector<bst_float> weight(kNumGroups);
+  std::vector<bst_uint> group(kNumGroups);
+  group[0] = 2;
+  group[1] = 3;
+  group[2] = 7;
+  group[3] = 5;
+  std::vector<bst_float> labels(kNumRows);
+  for (size_t i = 0; i < kNumRows; ++i) {
+    labels[i] = i % 2;
+  }
+
+  p_mat->Info().SetInfo(
+      "weight", static_cast<void*>(weight.data()), DataType::kFloat32, kNumGroups);
+  p_mat->Info().SetInfo(
+      "group", group.data(), DataType::kUInt32, kNumGroups);
+  p_mat->Info().SetInfo("label", labels.data(), DataType::kFloat32, kNumRows);
+
+  std::vector<std::shared_ptr<DMatrix>> mat = {p_mat};
+  auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
+  learner->Configure({Arg{"objective", "rank:pairwise"}});
+  learner->InitModel();
+
+  EXPECT_NO_THROW(learner->UpdateOneIter(0, p_mat.get()));
+
+  group.resize(kNumGroups + 1);
+  group[3] = 4;
+  group[4] = 1;
+  p_mat->Info().SetInfo("group", group.data(), DataType::kUInt32, kNumGroups + 1);
+  EXPECT_ANY_THROW(learner->UpdateOneIter(0, p_mat.get()));
+
+  delete pp_mat;
+}
+
 }  // namespace xgboost
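Why the final `EXPECT_ANY_THROW` fires: `group.resize(kNumGroups + 1)` plus the
two assignments changes the group sizes to {2, 3, 7, 4, 1}, so the DMatrix now
declares 5 groups while only 4 weights were set, and the `ValidateDMatrix` check
added in src/learner.cc fails. The arithmetic, spelled out:

    // Group sizes after the resize: 2 + 3 + 7 + 4 + 1 == 17 rows, still
    // consistent with kNumRows, but weights.size() == 4 while
    // group_ptr.size() - 1 == 5, so
    // CHECK(weights.size() == info.group_ptr_.size() - 1) raises an error,
    // which the test observes as a throw.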