Prevent empty quantiles in fast hist (#4155)
* Prevent empty quantiles * Revise and improve unit tests for quantile hist * Remove unnecessary comment * Add #2943 as a test case * Skip test if no sklearn * Revise misleading comments
This commit is contained in:
parent
e1240413c9
commit
549c8d6ae9
@ -148,14 +148,17 @@ void HistCutMatrix::Init
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// push a value that is greater than anything
|
// push a value that is greater than anything
|
||||||
if (a.size != 0) {
|
const bst_float cpt
|
||||||
bst_float cpt = a.data[a.size - 1].value;
|
= (a.size > 0) ? a.data[a.size - 1].value : this->min_val[fid];
|
||||||
// this must be bigger than last value in a scale
|
// this must be bigger than last value in a scale
|
||||||
bst_float last = cpt + (fabs(cpt) + 1e-5);
|
const bst_float last = cpt + (fabs(cpt) + 1e-5);
|
||||||
cut.push_back(last);
|
cut.push_back(last);
|
||||||
}
|
|
||||||
|
|
||||||
row_ptr.push_back(static_cast<bst_uint>(cut.size()));
|
// Ensure that every feature gets at least one quantile point
|
||||||
|
CHECK_LE(cut.size(), std::numeric_limits<uint32_t>::max());
|
||||||
|
auto cut_size = static_cast<uint32_t>(cut.size());
|
||||||
|
CHECK_GT(cut_size, row_ptr.back());
|
||||||
|
row_ptr.push_back(cut_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,7 +168,9 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) {
|
|||||||
auto cend = cut.begin() + row_ptr[fid + 1];
|
auto cend = cut.begin() + row_ptr[fid + 1];
|
||||||
CHECK(cbegin != cend);
|
CHECK(cbegin != cend);
|
||||||
auto it = std::upper_bound(cbegin, cend, e.fvalue);
|
auto it = std::upper_bound(cbegin, cend, e.fvalue);
|
||||||
if (it == cend) it = cend - 1;
|
if (it == cend) {
|
||||||
|
it = cend - 1;
|
||||||
|
}
|
||||||
uint32_t idx = static_cast<uint32_t>(it - cut.begin());
|
uint32_t idx = static_cast<uint32_t>(it - cut.begin());
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,11 +4,13 @@
|
|||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
#include "../../../src/tree/param.h"
|
#include "../../../src/tree/param.h"
|
||||||
#include "../../../src/tree/updater_quantile_hist.h"
|
#include "../../../src/tree/updater_quantile_hist.h"
|
||||||
|
#include "../../../src/tree/split_evaluator.h"
|
||||||
#include "../../../src/common/host_device_vector.h"
|
#include "../../../src/common/host_device_vector.h"
|
||||||
|
|
||||||
#include <xgboost/tree_updater.h>
|
#include <xgboost/tree_updater.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -22,43 +24,105 @@ class QuantileHistMock : public QuantileHistMaker {
|
|||||||
using RealImpl = QuantileHistMaker::Builder;
|
using RealImpl = QuantileHistMaker::Builder;
|
||||||
|
|
||||||
BuilderMock(const TrainParam& param,
|
BuilderMock(const TrainParam& param,
|
||||||
std::unique_ptr<TreeUpdater> pruner,
|
std::unique_ptr<TreeUpdater> pruner,
|
||||||
std::unique_ptr<SplitEvaluator> spliteval)
|
std::unique_ptr<SplitEvaluator> spliteval)
|
||||||
: RealImpl(param, std::move(pruner), std::move(spliteval)) {}
|
: RealImpl(param, std::move(pruner), std::move(spliteval)) {}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
void TestInitData(const GHistIndexMatrix& gmat,
|
void TestInitData(const GHistIndexMatrix& gmat,
|
||||||
const std::vector<GradientPair>& gpair,
|
const std::vector<GradientPair>& gpair,
|
||||||
const DMatrix& fmat,
|
DMatrix* p_fmat,
|
||||||
const RegTree& tree) {
|
const RegTree& tree) {
|
||||||
RealImpl::InitData(gmat, gpair, fmat, tree);
|
RealImpl::InitData(gmat, gpair, *p_fmat, tree);
|
||||||
ASSERT_EQ(data_layout_, kSparseData);
|
ASSERT_EQ(data_layout_, kSparseData);
|
||||||
|
|
||||||
|
/* The creation of HistCutMatrix and GHistIndexMatrix are not technically
|
||||||
|
* part of QuantileHist updater logic, but we include it here because
|
||||||
|
* QuantileHist updater object currently stores GHistIndexMatrix
|
||||||
|
* internally. According to https://github.com/dmlc/xgboost/pull/3803,
|
||||||
|
* we should eventually move GHistIndexMatrix out of the QuantileHist
|
||||||
|
* updater. */
|
||||||
|
|
||||||
|
const size_t num_row = p_fmat->Info().num_row_;
|
||||||
|
const size_t num_col = p_fmat->Info().num_col_;
|
||||||
|
/* Validate HistCutMatrix */
|
||||||
|
ASSERT_EQ(gmat.cut.row_ptr.size(), num_col + 1);
|
||||||
|
for (size_t fid = 0; fid < num_col; ++fid) {
|
||||||
|
// Each feature must have at least one quantile point (cut)
|
||||||
|
const size_t ibegin = gmat.cut.row_ptr[fid];
|
||||||
|
const size_t iend = gmat.cut.row_ptr[fid + 1];
|
||||||
|
ASSERT_LT(ibegin, iend);
|
||||||
|
for (size_t i = ibegin; i < iend - 1; ++i) {
|
||||||
|
// Quantile points must be sorted in ascending order
|
||||||
|
// No duplicates allowed
|
||||||
|
ASSERT_LT(gmat.cut.cut[i], gmat.cut.cut[i + 1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Validate GHistIndexMatrix */
|
||||||
|
ASSERT_EQ(gmat.row_ptr.size(), num_row + 1);
|
||||||
|
ASSERT_LT(*std::max_element(gmat.index.begin(), gmat.index.end()),
|
||||||
|
gmat.cut.row_ptr.back());
|
||||||
|
for (const auto& batch : p_fmat->GetRowBatches()) {
|
||||||
|
for (size_t i = 0; i < batch.Size(); ++i) {
|
||||||
|
const size_t rid = batch.base_rowid + i;
|
||||||
|
ASSERT_LT(rid, num_row);
|
||||||
|
const size_t gmat_row_offset = gmat.row_ptr[rid];
|
||||||
|
ASSERT_LT(gmat_row_offset, gmat.index.size());
|
||||||
|
SparsePage::Inst inst = batch[i];
|
||||||
|
ASSERT_EQ(gmat.row_ptr[rid] + inst.size(), gmat.row_ptr[rid + 1]);
|
||||||
|
for (size_t j = 0; j < inst.size(); ++j) {
|
||||||
|
// Each entry of GHistIndexMatrix represents a bin ID
|
||||||
|
const size_t bin_id = gmat.index[gmat_row_offset + j];
|
||||||
|
const size_t fid = inst[j].index;
|
||||||
|
// The bin ID must correspond to correct feature
|
||||||
|
ASSERT_GE(bin_id, gmat.cut.row_ptr[fid]);
|
||||||
|
ASSERT_LT(bin_id, gmat.cut.row_ptr[fid + 1]);
|
||||||
|
// The bin ID must correspond to a region between two
|
||||||
|
// suitable quantile points
|
||||||
|
ASSERT_LT(inst[j].fvalue, gmat.cut.cut[bin_id]);
|
||||||
|
if (bin_id > gmat.cut.row_ptr[fid]) {
|
||||||
|
ASSERT_GE(inst[j].fvalue, gmat.cut.cut[bin_id - 1]);
|
||||||
|
} else {
|
||||||
|
ASSERT_GE(inst[j].fvalue, gmat.cut.min_val[fid]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void TestBuildHist(int nid,
|
void TestBuildHist(int nid,
|
||||||
const GHistIndexMatrix& gmat,
|
const GHistIndexMatrix& gmat,
|
||||||
const DMatrix& fmat,
|
const DMatrix& fmat,
|
||||||
const RegTree& tree) {
|
const RegTree& tree) {
|
||||||
std::vector<GradientPair> gpair =
|
const std::vector<GradientPair> gpair =
|
||||||
{ {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
|
{ {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
|
||||||
{0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
|
{0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
|
||||||
RealImpl::InitData(gmat, gpair, fmat, tree);
|
RealImpl::InitData(gmat, gpair, fmat, tree);
|
||||||
GHistIndexBlockMatrix quantile_index_block;
|
GHistIndexBlockMatrix dummy;
|
||||||
hist_.AddHistRow(nid);
|
hist_.AddHistRow(nid);
|
||||||
BuildHist(gpair, row_set_collection_[nid],
|
BuildHist(gpair, row_set_collection_[nid],
|
||||||
gmat, quantile_index_block, hist_[nid], false);
|
gmat, dummy, hist_[nid], false);
|
||||||
std::vector<GradientPairPrecise> solution {
|
|
||||||
{0.27f, 0.29f}, {0.27f, 0.29f}, {0.47f, 0.49f},
|
|
||||||
{0.27f, 0.29f}, {0.57f, 0.59f}, {0.26f, 0.27f},
|
|
||||||
{0.37f, 0.39f}, {0.23f, 0.24f}, {0.37f, 0.39f},
|
|
||||||
{0.27f, 0.28f}, {0.27f, 0.29f}, {0.37f, 0.39f},
|
|
||||||
{0.26f, 0.27f}, {0.23f, 0.24f}, {0.57f, 0.59f},
|
|
||||||
{0.47f, 0.49f}, {0.47f, 0.49f}, {0.37f, 0.39f},
|
|
||||||
{0.26f, 0.27f}, {0.23f, 0.24f}, {0.27f, 0.28f},
|
|
||||||
{0.57f, 0.59f}, {0.23f, 0.24f}, {0.47f, 0.49f}};
|
|
||||||
|
|
||||||
|
// Check if number of histogram bins is correct
|
||||||
|
ASSERT_EQ(hist_[nid].size(), gmat.cut.row_ptr.back());
|
||||||
|
std::vector<GradientPairPrecise> histogram_expected(hist_[nid].size());
|
||||||
|
|
||||||
|
// Compute the correct histogram (histogram_expected)
|
||||||
|
const size_t num_row = fmat.Info().num_row_;
|
||||||
|
CHECK_EQ(gpair.size(), num_row);
|
||||||
|
for (size_t rid = 0; rid < num_row; ++rid) {
|
||||||
|
const size_t ibegin = gmat.row_ptr[rid];
|
||||||
|
const size_t iend = gmat.row_ptr[rid + 1];
|
||||||
|
for (size_t i = ibegin; i < iend; ++i) {
|
||||||
|
const size_t bin_id = gmat.index[i];
|
||||||
|
histogram_expected[bin_id] += GradientPairPrecise(gpair[rid]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now validate the computed histogram returned by BuildHist
|
||||||
for (size_t i = 0; i < hist_[nid].size(); ++i) {
|
for (size_t i = 0; i < hist_[nid].size(); ++i) {
|
||||||
GradientPairPrecise sol = solution[i];
|
GradientPairPrecise sol = histogram_expected[i];
|
||||||
ASSERT_NEAR(sol.GetGrad(), hist_[nid][i].GetGrad(), kEps);
|
ASSERT_NEAR(sol.GetGrad(), hist_[nid][i].GetGrad(), kEps);
|
||||||
ASSERT_NEAR(sol.GetHess(), hist_[nid][i].GetHess(), kEps);
|
ASSERT_NEAR(sol.GetHess(), hist_[nid][i].GetHess(), kEps);
|
||||||
}
|
}
|
||||||
@ -67,10 +131,11 @@ class QuantileHistMock : public QuantileHistMaker {
|
|||||||
void TestEvaluateSplit(const GHistIndexBlockMatrix& quantile_index_block,
|
void TestEvaluateSplit(const GHistIndexBlockMatrix& quantile_index_block,
|
||||||
const RegTree& tree) {
|
const RegTree& tree) {
|
||||||
std::vector<GradientPair> row_gpairs =
|
std::vector<GradientPair> row_gpairs =
|
||||||
{ {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
|
{ {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
|
||||||
{0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
|
{0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} };
|
||||||
size_t constexpr max_bins = 4;
|
size_t constexpr max_bins = 4;
|
||||||
auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3); // dense
|
auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3);
|
||||||
|
// dense, no missing values
|
||||||
|
|
||||||
common::GHistIndexMatrix gmat;
|
common::GHistIndexMatrix gmat;
|
||||||
gmat.Init((*dmat).get(), max_bins);
|
gmat.Init((*dmat).get(), max_bins);
|
||||||
@ -82,14 +147,67 @@ class QuantileHistMock : public QuantileHistMaker {
|
|||||||
gmat, quantile_index_block, hist_[0], false);
|
gmat, quantile_index_block, hist_[0], false);
|
||||||
|
|
||||||
RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
|
RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
|
||||||
// Manipulate the root_gain so that I don't have to invent an actual
|
|
||||||
// split. Yes, I'm cheating.
|
|
||||||
snode_[0].root_gain = 0.8;
|
|
||||||
RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);
|
|
||||||
|
|
||||||
ASSERT_NEAR(snode_.at(0).best.loss_chg, 0.7128048, kEps);
|
/* Compute correct split (best_split) using the computed histogram */
|
||||||
ASSERT_EQ(snode_.at(0).best.SplitIndex(), 10);
|
const size_t num_row = dmat->get()->Info().num_row_;
|
||||||
ASSERT_NEAR(snode_.at(0).best.split_value, 0.182258, kEps);
|
const size_t num_feature = dmat->get()->Info().num_col_;
|
||||||
|
CHECK_EQ(num_row, row_gpairs.size());
|
||||||
|
// Compute total gradient for all data points
|
||||||
|
GradientPairPrecise total_gpair;
|
||||||
|
for (const auto& e : row_gpairs) {
|
||||||
|
total_gpair += GradientPairPrecise(e);
|
||||||
|
}
|
||||||
|
// Initialize split evaluator
|
||||||
|
std::unique_ptr<SplitEvaluator> evaluator(SplitEvaluator::Create("elastic_net"));
|
||||||
|
evaluator->Init({});
|
||||||
|
|
||||||
|
// Now enumerate all feature*threshold combination to get best split
|
||||||
|
// To simplify logic, we make some assumptions:
|
||||||
|
// 1) no missing values in data
|
||||||
|
// 2) no regularization, i.e. set min_child_weight, reg_lambda, reg_alpha,
|
||||||
|
// and max_delta_step to 0.
|
||||||
|
bst_float best_split_gain = 0.0f;
|
||||||
|
size_t best_split_threshold, best_split_feature;
|
||||||
|
// Enumerate all features
|
||||||
|
for (size_t fid = 0; fid < num_feature; ++fid) {
|
||||||
|
const size_t bin_id_min = gmat.cut.row_ptr[fid];
|
||||||
|
const size_t bin_id_max = gmat.cut.row_ptr[fid + 1];
|
||||||
|
// Enumerate all bin ID in [bin_id_min, bin_id_max), i.e. every possible
|
||||||
|
// choice of thresholds for feature fid
|
||||||
|
for (size_t split_thresh = bin_id_min;
|
||||||
|
split_thresh < bin_id_max; ++split_thresh) {
|
||||||
|
// left_sum, right_sum: Gradient sums for data points whose feature
|
||||||
|
// value is left/right side of the split threshold
|
||||||
|
GradientPairPrecise left_sum, right_sum;
|
||||||
|
for (size_t rid = 0; rid < num_row; ++rid) {
|
||||||
|
for (size_t offset = gmat.row_ptr[rid];
|
||||||
|
offset < gmat.row_ptr[rid + 1]; ++offset) {
|
||||||
|
const size_t bin_id = gmat.index[offset];
|
||||||
|
if (bin_id >= bin_id_min && bin_id < bin_id_max) {
|
||||||
|
if (bin_id <= split_thresh) {
|
||||||
|
left_sum += GradientPairPrecise(row_gpairs[rid]);
|
||||||
|
} else {
|
||||||
|
right_sum += GradientPairPrecise(row_gpairs[rid]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Now compute gain (change in loss)
|
||||||
|
const auto split_gain
|
||||||
|
= evaluator->ComputeSplitScore(0, fid, GradStats(left_sum),
|
||||||
|
GradStats(right_sum));
|
||||||
|
if (split_gain > best_split_gain) {
|
||||||
|
best_split_gain = split_gain;
|
||||||
|
best_split_feature = fid;
|
||||||
|
best_split_threshold = split_thresh;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now compare against result given by EvaluateSplit() */
|
||||||
|
RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);
|
||||||
|
ASSERT_EQ(snode_[0].best.SplitIndex(), best_split_feature);
|
||||||
|
ASSERT_EQ(snode_[0].best.split_value, gmat.cut.cut[best_split_threshold]);
|
||||||
|
|
||||||
delete dmat;
|
delete dmat;
|
||||||
}
|
}
|
||||||
@ -128,7 +246,7 @@ class QuantileHistMock : public QuantileHistMaker {
|
|||||||
{ {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
|
{ {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
|
||||||
{0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
|
{0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
|
||||||
|
|
||||||
builder_->TestInitData(gmat, gpair, *(*dmat), tree);
|
builder_->TestInitData(gmat, gpair, dmat->get(), tree);
|
||||||
}
|
}
|
||||||
|
|
||||||
void TestBuildHist() {
|
void TestBuildHist() {
|
||||||
@ -169,7 +287,9 @@ TEST(Updater, QuantileHist_BuildHist) {
|
|||||||
TEST(Updater, QuantileHist_EvalSplits) {
|
TEST(Updater, QuantileHist_EvalSplits) {
|
||||||
std::vector<std::pair<std::string, std::string>> cfg
|
std::vector<std::pair<std::string, std::string>> cfg
|
||||||
{{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
|
{{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
|
||||||
{"split_evaluator", "elastic_net"}};
|
{"split_evaluator", "elastic_net"},
|
||||||
|
{"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"},
|
||||||
|
{"min_child_weight", "0"}};
|
||||||
QuantileHistMock maker(cfg);
|
QuantileHistMock maker(cfg);
|
||||||
maker.TestEvaluateSplit();
|
maker.TestEvaluateSplit();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import testing as tm
|
|||||||
import unittest
|
import unittest
|
||||||
import pytest
|
import pytest
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from regression_test_utilities import run_suite, parameter_combinations, \
|
from regression_test_utilities import run_suite, parameter_combinations, \
|
||||||
@ -59,3 +60,17 @@ class TestUpdaters(unittest.TestCase):
|
|||||||
evals_result=exact_res)
|
evals_result=exact_res)
|
||||||
assert hist_res['train']['auc'] == exact_res['train']['auc']
|
assert hist_res['train']['auc'] == exact_res['train']['auc']
|
||||||
assert hist_res['test']['auc'] == exact_res['test']['auc']
|
assert hist_res['test']['auc'] == exact_res['test']['auc']
|
||||||
|
|
||||||
|
@pytest.mark.skipif(**tm.no_sklearn())
|
||||||
|
def test_fast_histmaker_degenerate_case(self):
|
||||||
|
# Test a degenerate case where the quantile sketcher won't return any
|
||||||
|
# quantile points for a particular feature (the second feature in
|
||||||
|
# this example). Source: https://github.com/dmlc/xgboost/issues/2943
|
||||||
|
nan = np.nan
|
||||||
|
param = {'missing': nan, 'tree_method': 'hist'}
|
||||||
|
model = xgb.XGBRegressor(**param)
|
||||||
|
X = [[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan],
|
||||||
|
[6.38888889e+05, nan], [6.28086420e+05, nan]]
|
||||||
|
y = [1000000., 0., 0., 500000.]
|
||||||
|
w = [0, 0, 1, 0]
|
||||||
|
model.fit(X, y, sample_weight=w)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user