Support slicing tree model (#6302)

This PR is meant to end the confusion around best_ntree_limit and to unify model slicing. With multi-class models and random forests in the mix, asking users to understand how to set ntree_limit is difficult and error-prone.

* Implement the save_best option in early stopping.
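
For context, the slicing interface exercised by the tests below looks roughly
like this (a sketch; dtrain is assumed to be an existing DMatrix and the
parameter values are illustrative, not taken from this diff):

    import xgboost as xgb

    booster = xgb.train({'num_class': 3, 'objective': 'multi:softprob'},
                        dtrain, num_boost_round=16)
    sliced = booster[3:7]          # boosting rounds [3, 7)
    stepped = booster[3:7:2]       # every other round in [3, 7)
    rounds = [t for t in booster]  # one slice per boosting round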

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Author: Jiaming Yuan
Date: 2020-11-03 02:27:39 -05:00
Committed by: GitHub
Parent: 29745c6df2
Commit: 2cc9662005
19 changed files with 550 additions and 37 deletions

@@ -154,9 +154,9 @@ TEST(GBTree, JsonIO) {
   ASSERT_EQ(get<String>(model["model"]["name"]), "gbtree");
 
   auto const& gbtree_model = model["model"]["model"];
-  ASSERT_EQ(get<Array>(gbtree_model["trees"]).size(), 1);
+  ASSERT_EQ(get<Array>(gbtree_model["trees"]).size(), 1ul);
   ASSERT_EQ(get<Integer>(get<Object>(get<Array>(gbtree_model["trees"]).front()).at("id")), 0);
-  ASSERT_EQ(get<Array>(gbtree_model["tree_info"]).size(), 1);
+  ASSERT_EQ(get<Array>(gbtree_model["tree_info"]).size(), 1ul);
 
   auto j_train_param = model["config"]["gbtree_train_param"];
   ASSERT_EQ(get<String>(j_train_param["num_parallel_tree"]), "1");
@@ -194,7 +194,7 @@ TEST(Dart, JsonIO) {
   ASSERT_EQ(get<String>(model["model"]["name"]), "dart") << model;
   ASSERT_EQ(get<String>(model["config"]["name"]), "dart");
   ASSERT_TRUE(IsA<Object>(model["model"]["gbtree"]));
-  ASSERT_NE(get<Array>(model["model"]["weight_drop"]).size(), 0);
+  ASSERT_NE(get<Array>(model["model"]["weight_drop"]).size(), 0ul);
 }
 
 TEST(Dart, Prediction) {
@@ -230,4 +230,122 @@ TEST(Dart, Prediction) {
     ASSERT_GT(std::abs(h_predts_training[i] - h_predts_inference[i]), kRtEps);
   }
 }
+
+std::pair<Json, Json> TestModelSlice(std::string booster) {
+  size_t constexpr kRows = 1000, kCols = 100, kForest = 2, kClasses = 3;
+  auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true, false, kClasses);
+
+  int32_t kIters = 10;
+  std::unique_ptr<Learner> learner {
+    Learner::Create({m})
+  };
+  learner->SetParams(Args{{"booster", booster},
+                          {"tree_method", "hist"},
+                          {"num_parallel_tree", std::to_string(kForest)},
+                          {"num_class", std::to_string(kClasses)},
+                          {"subsample", "0.5"},
+                          {"max_depth", "2"}});
+  for (auto i = 0; i < kIters; ++i) {
+    learner->UpdateOneIter(i, m);
+  }
+
+  Json model{Object()};
+  Json config{Object()};
+  learner->SaveModel(&model);
+  learner->SaveConfig(&config);
+
+  bool out_of_bound = false;
+  size_t constexpr kSliceStart = 2, kSliceEnd = 8, kStep = 3;
+  std::unique_ptr<Learner> sliced {learner->Slice(kSliceStart, kSliceEnd, kStep, &out_of_bound)};
+  Json sliced_model{Object()};
+  sliced->SaveModel(&sliced_model);
+
+  auto get_shape = [&](Json const& model) {
+    if (booster == "gbtree") {
+      return get<Object const>(model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]);
+    } else {
+      return get<Object const>(model["learner"]["gradient_booster"]["gbtree"]["model"]["gbtree_model_param"]);
+    }
+  };
+
+  auto const& model_shape = get_shape(sliced_model);
+  CHECK_EQ(get<String const>(model_shape.at("num_trees")), std::to_string(2 * kClasses * kForest));
+
+  Json sliced_config {Object()};
+  sliced->SaveConfig(&sliced_config);
+  CHECK_EQ(sliced_config, config);
+
+  auto get_trees = [&](Json const& model) {
+    if (booster == "gbtree") {
+      return get<Array const>(model["learner"]["gradient_booster"]["model"]["trees"]);
+    } else {
+      return get<Array const>(model["learner"]["gradient_booster"]["gbtree"]["model"]["trees"]);
+    }
+  };
+  auto get_info = [&](Json const& model) {
+    if (booster == "gbtree") {
+      return get<Array const>(model["learner"]["gradient_booster"]["model"]["tree_info"]);
+    } else {
+      return get<Array const>(model["learner"]["gradient_booster"]["gbtree"]["model"]["tree_info"]);
    }
+  };
+
+  auto const &sliced_trees = get_trees(sliced_model);
+  CHECK_EQ(sliced_trees.size(), 2 * kClasses * kForest);
+
+  auto constexpr kLayerSize = kClasses * kForest;
+  auto const &sliced_info = get_info(sliced_model);
+  for (size_t layer = 0; layer < 2; ++layer) {
+    for (size_t j = 0; j < kClasses; ++j) {
+      for (size_t k = 0; k < kForest; ++k) {
+        auto idx = layer * kLayerSize + j * kForest + k;
+        auto const &group = get<Integer const>(sliced_info.at(idx));
+        CHECK_EQ(static_cast<size_t>(group), j);
+      }
+    }
+  }
+
+  auto const& trees = get_trees(model);
+  // Sliced layers are {2, 5}.
+  auto begin = kLayerSize * kSliceStart;
+  auto end = begin + kLayerSize;
+  auto j = 0;
+  for (size_t i = begin; i < end; ++i) {
+    Json tree = trees[i];
+    tree["id"] = Integer(0);  // id is different, we set it to 0 to allow comparison.
+    auto sliced_tree = sliced_trees[j];
+    sliced_tree["id"] = Integer(0);
+    CHECK_EQ(tree, sliced_tree);
+    j++;
+  }
+
+  begin = kLayerSize * (kSliceStart + kStep);
+  end = begin + kLayerSize;
+  for (size_t i = begin; i < end; ++i) {
+    Json tree = trees[i];
+    tree["id"] = Integer(0);
+    auto sliced_tree = sliced_trees[j];
+    sliced_tree["id"] = Integer(0);
+    CHECK_EQ(tree, sliced_tree);
+    j++;
+  }
+
+  return std::make_pair(model, sliced_model);
+}
+
+TEST(GBTree, Slice) {
+  TestModelSlice("gbtree");
+}
+
+TEST(Dart, Slice) {
+  Json model, sliced_model;
+  std::tie(model, sliced_model) = TestModelSlice("dart");
+  auto const& weights = get<Array const>(model["learner"]["gradient_booster"]["weight_drop"]);
+  auto const& trees = get<Array const>(model["learner"]["gradient_booster"]["gbtree"]["model"]["trees"]);
+  ASSERT_EQ(weights.size(), trees.size());
+}
 } // namespace xgboost
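
The slice test above leans on the booster's tree layout: trees are stored
layer by layer, and one layer holds num_class * num_parallel_tree trees. A
small sketch of the same index arithmetic in Python (names here are
illustrative, mirroring kLayerSize and the idx computation in the test):

    num_class, num_parallel_tree = 3, 2
    layer_size = num_class * num_parallel_tree  # kLayerSize in the test

    def tree_index(layer, class_idx, tree_in_forest):
        # Mirrors idx = layer * kLayerSize + j * kForest + k above.
        return layer * layer_size + class_idx * num_parallel_tree + tree_in_forest

    # Slicing layers [2, 8) with step 3 keeps layers 2 and 5, i.e.
    # 2 * layer_size = 12 trees, matching the num_trees check above.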

@@ -118,7 +118,7 @@ TEST(Learner, Configuration) {
 
     // eval_metric is not part of configuration
     auto attr_names = learner->GetConfigurationArguments();
-    ASSERT_EQ(attr_names.size(), 1);
+    ASSERT_EQ(attr_names.size(), 1ul);
     ASSERT_EQ(attr_names.find(emetric), attr_names.cend());
     ASSERT_EQ(attr_names.at("foo"), "bar");
   }
@@ -127,7 +127,7 @@ TEST(Learner, Configuration) {
     std::unique_ptr<Learner> learner { Learner::Create({nullptr}) };
     learner->SetParams({{"foo", "bar"}, {emetric, "auc"}, {emetric, "entropy"}, {emetric, "KL"}});
     auto attr_names = learner->GetConfigurationArguments();
-    ASSERT_EQ(attr_names.size(), 1);
+    ASSERT_EQ(attr_names.size(), 1ul);
     ASSERT_EQ(attr_names.at("foo"), "bar");
   }
 }
@@ -181,7 +181,7 @@ TEST(Learner, JsonModelIO) {
     learner->SaveModel(&new_in);
 
     ASSERT_TRUE(IsA<Object>(out["learner"]["attributes"]));
-    ASSERT_EQ(get<Object>(out["learner"]["attributes"]).size(), 1);
+    ASSERT_EQ(get<Object>(out["learner"]["attributes"]).size(), 1ul);
     ASSERT_EQ(out, new_in);
   }
 }
@@ -333,5 +333,4 @@ TEST(Learner, Seed) {
   ASSERT_EQ(std::to_string(seed),
             get<String>(config["learner"]["generic_param"]["seed"]));
 }
 } // namespace xgboost

@@ -29,7 +29,7 @@ def json_model(model_path, parameters):
     return model
 
 
-class TestModels(unittest.TestCase):
+class TestModels:
     def test_glm(self):
        param = {'verbosity': 0, 'objective': 'binary:logistic',
                 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
@@ -209,12 +209,14 @@ class TestModels(unittest.TestCase):
 
         bst = xgb.train([], dm1)
         bst.predict(dm1)  # success
-        self.assertRaises(ValueError, bst.predict, dm2)
+        with pytest.raises(ValueError):
+            bst.predict(dm2)
         bst.predict(dm1)  # success
 
         bst = xgb.train([], dm2)
         bst.predict(dm2)  # success
-        self.assertRaises(ValueError, bst.predict, dm1)
+        with pytest.raises(ValueError):
+            bst.predict(dm1)
         bst.predict(dm2)  # success
 
     def test_model_binary_io(self):
@@ -325,3 +327,96 @@ class TestModels(unittest.TestCase):
         parameters = {'tree_method': 'hist', 'booster': 'dart',
                       'objective': 'multi:softmax'}
         validate_model(parameters)
+
+    @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
+    def test_slice(self, booster):
+        from sklearn.datasets import make_classification
+        num_classes = 3
+        X, y = make_classification(n_samples=1000, n_informative=5,
+                                   n_classes=num_classes)
+        dtrain = xgb.DMatrix(data=X, label=y)
+        num_parallel_tree = 4
+        num_boost_round = 16
+        total_trees = num_parallel_tree * num_classes * num_boost_round
+        booster = xgb.train({
+            'num_parallel_tree': num_parallel_tree, 'subsample': 0.5,
+            'num_class': num_classes, 'booster': booster,
+            'objective': 'multi:softprob'},
+                            num_boost_round=num_boost_round, dtrain=dtrain)
+        assert len(booster.get_dump()) == total_trees
+
+        beg = 3
+        end = 7
+        sliced: xgb.Booster = booster[beg: end]
+        sliced_trees = (end - beg) * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced_trees = sliced_trees // 2
+        sliced: xgb.Booster = booster[beg: end: 2]
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[beg: ...]
+        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[beg:]
+        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[:end]
+        sliced_trees = end * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[...:end]
+        sliced_trees = end * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        with pytest.raises(ValueError, match=r'>= 0'):
+            booster[-1: 0]
+        # we do not accept empty slice.
+        with pytest.raises(ValueError):
+            booster[1:1]
+        # stop can not be smaller than begin
+        with pytest.raises(ValueError, match=r'Invalid.*'):
+            booster[3:0]
+        with pytest.raises(ValueError, match=r'Invalid.*'):
+            booster[3:-1]
+        # negative step is not supported.
+        with pytest.raises(ValueError, match=r'.*>= 1.*'):
+            booster[0:2:-1]
+        # step can not be 0.
+        with pytest.raises(ValueError, match=r'.*>= 1.*'):
+            booster[0:2:0]
+
+        trees = [_ for _ in booster]
+        assert len(trees) == num_boost_round
+
+        with pytest.raises(TypeError):
+            booster["wrong type"]
+        with pytest.raises(IndexError):
+            booster[:num_boost_round + 1]
+        with pytest.raises(ValueError):
+            booster[1, 2]  # too many dims
+        # setitem is not implemented as model is immutable during slicing.
+        with pytest.raises(TypeError):
+            booster[...:end] = booster
+
+        sliced_0 = booster[1:3]
+        sliced_1 = booster[3:7]
+        predt_0 = sliced_0.predict(dtrain, output_margin=True)
+        predt_1 = sliced_1.predict(dtrain, output_margin=True)
+        merged = predt_0 + predt_1 - 0.5  # base score.
+        single = booster[1:7].predict(dtrain, output_margin=True)
+        np.testing.assert_allclose(merged, single, atol=1e-6)
+
+        sliced_0 = booster[1:7:2]  # 1, 3, 5
+        sliced_1 = booster[2:8:2]  # 2, 4, 6
+        predt_0 = sliced_0.predict(dtrain, output_margin=True)
+        predt_1 = sliced_1.predict(dtrain, output_margin=True)
+        merged = predt_0 + predt_1 - 0.5
+        single = booster[1:7].predict(dtrain, output_margin=True)
+        np.testing.assert_allclose(merged, single, atol=1e-6)
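
The margin comparisons at the end work because each slice's raw prediction
includes the global base_score exactly once, so merging two slices has to
subtract one extra copy of it (0.5, the default, in this test). A sketch of
the general rule for K disjoint slices, reusing booster and dtrain from the
test above:

    base_score = 0.5  # default base score assumed here
    slices = [booster[1:3], booster[3:5], booster[5:7]]
    margins = [s.predict(dtrain, output_margin=True) for s in slices]
    # Each margin carries one copy of base_score; keep exactly one.
    merged = sum(margins) - (len(slices) - 1) * base_score
    single = booster[1:7].predict(dtrain, output_margin=True)
    np.testing.assert_allclose(merged, single, atol=1e-6)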

@@ -113,6 +113,35 @@ class TestCallbacks(unittest.TestCase):
         dump = booster.get_dump(dump_format='json')
         assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
 
+    def test_early_stopping_save_best_model(self):
+        from sklearn.datasets import load_breast_cancer
+        X, y = load_breast_cancer(return_X_y=True)
+        n_estimators = 100
+        cls = xgb.XGBClassifier(n_estimators=n_estimators)
+        early_stopping_rounds = 5
+        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
+                                                save_best=True)
+        cls.fit(X, y, eval_set=[(X, y)],
+                eval_metric=tm.eval_error_metric, callbacks=[early_stop])
+        booster = cls.get_booster()
+        dump = booster.get_dump(dump_format='json')
+        assert len(dump) == booster.best_iteration
+
+        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
+                                                save_best=True)
+        cls = xgb.XGBClassifier(booster='gblinear', n_estimators=10)
+        self.assertRaises(ValueError, lambda: cls.fit(X, y, eval_set=[(X, y)],
+                                                      eval_metric=tm.eval_error_metric,
+                                                      callbacks=[early_stop]))
+
+        # No error
+        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
+                                                save_best=False)
+        xgb.XGBClassifier(booster='gblinear', n_estimators=10).fit(
+            X, y, eval_set=[(X, y)],
+            eval_metric=tm.eval_error_metric,
+            callbacks=[early_stop])
+
     def run_eta_decay(self, tree_method, deprecated_callback):
         if deprecated_callback:
             scheduler = xgb.callback.reset_learning_rate
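
For reference, a minimal sketch of save_best outside the sklearn wrapper,
assuming the same callback is passed to xgb.train and that dtrain and dvalid
are existing DMatrix objects:

    early_stop = xgb.callback.EarlyStopping(rounds=5, save_best=True)
    booster = xgb.train({'objective': 'binary:logistic'}, dtrain,
                        num_boost_round=100,
                        evals=[(dvalid, 'validation')],
                        callbacks=[early_stop])
    # With save_best=True the returned booster is already sliced to the
    # best iteration, so no ntree_limit is needed at prediction time.
    predt = booster.predict(dvalid)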