From 549c8d6ae998a5aa4578102a517e4b4def8761c9 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sun, 17 Feb 2019 16:01:07 -0800 Subject: [PATCH] Prevent empty quantiles in fast hist (#4155) * Prevent empty quantiles * Revise and improve unit tests for quantile hist * Remove unnecessary comment * Add #2943 as a test case * Skip test if no sklearn * Revise misleading comments --- src/common/hist_util.cc | 21 ++-- tests/cpp/tree/test_quantile_hist.cc | 182 ++++++++++++++++++++++----- tests/python/test_updaters.py | 15 +++ 3 files changed, 179 insertions(+), 39 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index ad2e1696e..8e1340313 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -148,14 +148,17 @@ void HistCutMatrix::Init } } // push a value that is greater than anything - if (a.size != 0) { - bst_float cpt = a.data[a.size - 1].value; - // this must be bigger than last value in a scale - bst_float last = cpt + (fabs(cpt) + 1e-5); - cut.push_back(last); - } + const bst_float cpt + = (a.size > 0) ? a.data[a.size - 1].value : this->min_val[fid]; + // this must be bigger than last value in a scale + const bst_float last = cpt + (fabs(cpt) + 1e-5); + cut.push_back(last); - row_ptr.push_back(static_cast(cut.size())); + // Ensure that every feature gets at least one quantile point + CHECK_LE(cut.size(), std::numeric_limits::max()); + auto cut_size = static_cast(cut.size()); + CHECK_GT(cut_size, row_ptr.back()); + row_ptr.push_back(cut_size); } } @@ -165,7 +168,9 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) { auto cend = cut.begin() + row_ptr[fid + 1]; CHECK(cbegin != cend); auto it = std::upper_bound(cbegin, cend, e.fvalue); - if (it == cend) it = cend - 1; + if (it == cend) { + it = cend - 1; + } uint32_t idx = static_cast(it - cut.begin()); return idx; } diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 47ed36c92..819860e8a 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -4,11 +4,13 @@ #include "../helpers.h" #include "../../../src/tree/param.h" #include "../../../src/tree/updater_quantile_hist.h" +#include "../../../src/tree/split_evaluator.h" #include "../../../src/common/host_device_vector.h" #include #include +#include #include #include @@ -22,43 +24,105 @@ class QuantileHistMock : public QuantileHistMaker { using RealImpl = QuantileHistMaker::Builder; BuilderMock(const TrainParam& param, - std::unique_ptr pruner, - std::unique_ptr spliteval) + std::unique_ptr pruner, + std::unique_ptr spliteval) : RealImpl(param, std::move(pruner), std::move(spliteval)) {} public: void TestInitData(const GHistIndexMatrix& gmat, - const std::vector& gpair, - const DMatrix& fmat, - const RegTree& tree) { - RealImpl::InitData(gmat, gpair, fmat, tree); + const std::vector& gpair, + DMatrix* p_fmat, + const RegTree& tree) { + RealImpl::InitData(gmat, gpair, *p_fmat, tree); ASSERT_EQ(data_layout_, kSparseData); + + /* The creation of HistCutMatrix and GHistIndexMatrix are not technically + * part of QuantileHist updater logic, but we include it here because + * QuantileHist updater object currently stores GHistIndexMatrix + * internally. According to https://github.com/dmlc/xgboost/pull/3803, + * we should eventually move GHistIndexMatrix out of the QuantileHist + * updater. */ + + const size_t num_row = p_fmat->Info().num_row_; + const size_t num_col = p_fmat->Info().num_col_; + /* Validate HistCutMatrix */ + ASSERT_EQ(gmat.cut.row_ptr.size(), num_col + 1); + for (size_t fid = 0; fid < num_col; ++fid) { + // Each feature must have at least one quantile point (cut) + const size_t ibegin = gmat.cut.row_ptr[fid]; + const size_t iend = gmat.cut.row_ptr[fid + 1]; + ASSERT_LT(ibegin, iend); + for (size_t i = ibegin; i < iend - 1; ++i) { + // Quantile points must be sorted in ascending order + // No duplicates allowed + ASSERT_LT(gmat.cut.cut[i], gmat.cut.cut[i + 1]); + } + } + + /* Validate GHistIndexMatrix */ + ASSERT_EQ(gmat.row_ptr.size(), num_row + 1); + ASSERT_LT(*std::max_element(gmat.index.begin(), gmat.index.end()), + gmat.cut.row_ptr.back()); + for (const auto& batch : p_fmat->GetRowBatches()) { + for (size_t i = 0; i < batch.Size(); ++i) { + const size_t rid = batch.base_rowid + i; + ASSERT_LT(rid, num_row); + const size_t gmat_row_offset = gmat.row_ptr[rid]; + ASSERT_LT(gmat_row_offset, gmat.index.size()); + SparsePage::Inst inst = batch[i]; + ASSERT_EQ(gmat.row_ptr[rid] + inst.size(), gmat.row_ptr[rid + 1]); + for (size_t j = 0; j < inst.size(); ++j) { + // Each entry of GHistIndexMatrix represents a bin ID + const size_t bin_id = gmat.index[gmat_row_offset + j]; + const size_t fid = inst[j].index; + // The bin ID must correspond to correct feature + ASSERT_GE(bin_id, gmat.cut.row_ptr[fid]); + ASSERT_LT(bin_id, gmat.cut.row_ptr[fid + 1]); + // The bin ID must correspond to a region between two + // suitable quantile points + ASSERT_LT(inst[j].fvalue, gmat.cut.cut[bin_id]); + if (bin_id > gmat.cut.row_ptr[fid]) { + ASSERT_GE(inst[j].fvalue, gmat.cut.cut[bin_id - 1]); + } else { + ASSERT_GE(inst[j].fvalue, gmat.cut.min_val[fid]); + } + } + } + } } void TestBuildHist(int nid, const GHistIndexMatrix& gmat, const DMatrix& fmat, const RegTree& tree) { - std::vector gpair = + const std::vector gpair = { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f}, {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} }; RealImpl::InitData(gmat, gpair, fmat, tree); - GHistIndexBlockMatrix quantile_index_block; + GHistIndexBlockMatrix dummy; hist_.AddHistRow(nid); BuildHist(gpair, row_set_collection_[nid], - gmat, quantile_index_block, hist_[nid], false); - std::vector solution { - {0.27f, 0.29f}, {0.27f, 0.29f}, {0.47f, 0.49f}, - {0.27f, 0.29f}, {0.57f, 0.59f}, {0.26f, 0.27f}, - {0.37f, 0.39f}, {0.23f, 0.24f}, {0.37f, 0.39f}, - {0.27f, 0.28f}, {0.27f, 0.29f}, {0.37f, 0.39f}, - {0.26f, 0.27f}, {0.23f, 0.24f}, {0.57f, 0.59f}, - {0.47f, 0.49f}, {0.47f, 0.49f}, {0.37f, 0.39f}, - {0.26f, 0.27f}, {0.23f, 0.24f}, {0.27f, 0.28f}, - {0.57f, 0.59f}, {0.23f, 0.24f}, {0.47f, 0.49f}}; + gmat, dummy, hist_[nid], false); + // Check if number of histogram bins is correct + ASSERT_EQ(hist_[nid].size(), gmat.cut.row_ptr.back()); + std::vector histogram_expected(hist_[nid].size()); + + // Compute the correct histogram (histogram_expected) + const size_t num_row = fmat.Info().num_row_; + CHECK_EQ(gpair.size(), num_row); + for (size_t rid = 0; rid < num_row; ++rid) { + const size_t ibegin = gmat.row_ptr[rid]; + const size_t iend = gmat.row_ptr[rid + 1]; + for (size_t i = ibegin; i < iend; ++i) { + const size_t bin_id = gmat.index[i]; + histogram_expected[bin_id] += GradientPairPrecise(gpair[rid]); + } + } + + // Now validate the computed histogram returned by BuildHist for (size_t i = 0; i < hist_[nid].size(); ++i) { - GradientPairPrecise sol = solution[i]; + GradientPairPrecise sol = histogram_expected[i]; ASSERT_NEAR(sol.GetGrad(), hist_[nid][i].GetGrad(), kEps); ASSERT_NEAR(sol.GetHess(), hist_[nid][i].GetHess(), kEps); } @@ -67,10 +131,11 @@ class QuantileHistMock : public QuantileHistMaker { void TestEvaluateSplit(const GHistIndexBlockMatrix& quantile_index_block, const RegTree& tree) { std::vector row_gpairs = - { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f}, - {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} }; + { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f}, + {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} }; size_t constexpr max_bins = 4; - auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3); // dense + auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3); + // dense, no missing values common::GHistIndexMatrix gmat; gmat.Init((*dmat).get(), max_bins); @@ -82,14 +147,67 @@ class QuantileHistMock : public QuantileHistMaker { gmat, quantile_index_block, hist_[0], false); RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree); - // Manipulate the root_gain so that I don't have to invent an actual - // split. Yes, I'm cheating. - snode_[0].root_gain = 0.8; - RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree); - ASSERT_NEAR(snode_.at(0).best.loss_chg, 0.7128048, kEps); - ASSERT_EQ(snode_.at(0).best.SplitIndex(), 10); - ASSERT_NEAR(snode_.at(0).best.split_value, 0.182258, kEps); + /* Compute correct split (best_split) using the computed histogram */ + const size_t num_row = dmat->get()->Info().num_row_; + const size_t num_feature = dmat->get()->Info().num_col_; + CHECK_EQ(num_row, row_gpairs.size()); + // Compute total gradient for all data points + GradientPairPrecise total_gpair; + for (const auto& e : row_gpairs) { + total_gpair += GradientPairPrecise(e); + } + // Initialize split evaluator + std::unique_ptr evaluator(SplitEvaluator::Create("elastic_net")); + evaluator->Init({}); + + // Now enumerate all feature*threshold combination to get best split + // To simplify logic, we make some assumptions: + // 1) no missing values in data + // 2) no regularization, i.e. set min_child_weight, reg_lambda, reg_alpha, + // and max_delta_step to 0. + bst_float best_split_gain = 0.0f; + size_t best_split_threshold, best_split_feature; + // Enumerate all features + for (size_t fid = 0; fid < num_feature; ++fid) { + const size_t bin_id_min = gmat.cut.row_ptr[fid]; + const size_t bin_id_max = gmat.cut.row_ptr[fid + 1]; + // Enumerate all bin ID in [bin_id_min, bin_id_max), i.e. every possible + // choice of thresholds for feature fid + for (size_t split_thresh = bin_id_min; + split_thresh < bin_id_max; ++split_thresh) { + // left_sum, right_sum: Gradient sums for data points whose feature + // value is left/right side of the split threshold + GradientPairPrecise left_sum, right_sum; + for (size_t rid = 0; rid < num_row; ++rid) { + for (size_t offset = gmat.row_ptr[rid]; + offset < gmat.row_ptr[rid + 1]; ++offset) { + const size_t bin_id = gmat.index[offset]; + if (bin_id >= bin_id_min && bin_id < bin_id_max) { + if (bin_id <= split_thresh) { + left_sum += GradientPairPrecise(row_gpairs[rid]); + } else { + right_sum += GradientPairPrecise(row_gpairs[rid]); + } + } + } + } + // Now compute gain (change in loss) + const auto split_gain + = evaluator->ComputeSplitScore(0, fid, GradStats(left_sum), + GradStats(right_sum)); + if (split_gain > best_split_gain) { + best_split_gain = split_gain; + best_split_feature = fid; + best_split_threshold = split_thresh; + } + } + } + + /* Now compare against result given by EvaluateSplit() */ + RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree); + ASSERT_EQ(snode_[0].best.SplitIndex(), best_split_feature); + ASSERT_EQ(snode_[0].best.split_value, gmat.cut.cut[best_split_threshold]); delete dmat; } @@ -128,7 +246,7 @@ class QuantileHistMock : public QuantileHistMaker { { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }; - builder_->TestInitData(gmat, gpair, *(*dmat), tree); + builder_->TestInitData(gmat, gpair, dmat->get(), tree); } void TestBuildHist() { @@ -169,7 +287,9 @@ TEST(Updater, QuantileHist_BuildHist) { TEST(Updater, QuantileHist_EvalSplits) { std::vector> cfg {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())}, - {"split_evaluator", "elastic_net"}}; + {"split_evaluator", "elastic_net"}, + {"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"}, + {"min_child_weight", "0"}}; QuantileHistMock maker(cfg); maker.TestEvaluateSplit(); } diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index b19710e35..5cc41e551 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -2,6 +2,7 @@ import testing as tm import unittest import pytest import xgboost as xgb +import numpy as np try: from regression_test_utilities import run_suite, parameter_combinations, \ @@ -59,3 +60,17 @@ class TestUpdaters(unittest.TestCase): evals_result=exact_res) assert hist_res['train']['auc'] == exact_res['train']['auc'] assert hist_res['test']['auc'] == exact_res['test']['auc'] + + @pytest.mark.skipif(**tm.no_sklearn()) + def test_fast_histmaker_degenerate_case(self): + # Test a degenerate case where the quantile sketcher won't return any + # quantile points for a particular feature (the second feature in + # this example). Source: https://github.com/dmlc/xgboost/issues/2943 + nan = np.nan + param = {'missing': nan, 'tree_method': 'hist'} + model = xgb.XGBRegressor(**param) + X = [[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan], + [6.38888889e+05, nan], [6.28086420e+05, nan]] + y = [1000000., 0., 0., 500000.] + w = [0, 0, 1, 0] + model.fit(X, y, sample_weight=w)