Support slicing tree model (#6302)

This PR is meant the end the confusion around best_ntree_limit and unify model slicing. We have multi-class and random forests, asking users to understand how to set ntree_limit is difficult and error prone. * Implement the save_best option in early stopping. Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2020-11-03 02:27:39 -05:00
parent 29745c6df2
commit 2cc9662005
19 changed files with 550 additions and 37 deletions
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -29,7 +29,7 @@ def json_model(model_path, parameters):
    return model


-class TestModels(unittest.TestCase):
+class TestModels:
    def test_glm(self):
        param = {'verbosity': 0, 'objective': 'binary:logistic',
                 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
@@ -209,12 +209,14 @@ class TestModels(unittest.TestCase):

        bst = xgb.train([], dm1)
        bst.predict(dm1)  # success
-        self.assertRaises(ValueError, bst.predict, dm2)
+        with pytest.raises(ValueError):
+            bst.predict(dm2)
        bst.predict(dm1)  # success

        bst = xgb.train([], dm2)
        bst.predict(dm2)  # success
-        self.assertRaises(ValueError, bst.predict, dm1)
+        with pytest.raises(ValueError):
+            bst.predict(dm1)
        bst.predict(dm2)  # success

    def test_model_binary_io(self):
@@ -325,3 +327,96 @@ class TestModels(unittest.TestCase):
        parameters = {'tree_method': 'hist', 'booster': 'dart',
                      'objective': 'multi:softmax'}
        validate_model(parameters)
+
+    @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
+    def test_slice(self, booster):
+        from sklearn.datasets import make_classification
+        num_classes = 3
+        X, y = make_classification(n_samples=1000, n_informative=5,
+                                   n_classes=num_classes)
+        dtrain = xgb.DMatrix(data=X, label=y)
+        num_parallel_tree = 4
+        num_boost_round = 16
+        total_trees = num_parallel_tree * num_classes * num_boost_round
+        booster = xgb.train({
+            'num_parallel_tree': 4, 'subsample': 0.5, 'num_class': 3, 'booster': booster,
+            'objective': 'multi:softprob'},
+                            num_boost_round=num_boost_round, dtrain=dtrain)
+        assert len(booster.get_dump()) == total_trees
+        beg = 3
+        end = 7
+        sliced: xgb.Booster = booster[beg: end]
+
+        sliced_trees = (end - beg) * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced_trees = sliced_trees // 2
+        sliced: xgb.Booster = booster[beg: end: 2]
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[beg: ...]
+        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[beg:]
+        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[:end]
+        sliced_trees = end * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        sliced: xgb.Booster = booster[...:end]
+        sliced_trees = end * num_parallel_tree * num_classes
+        assert sliced_trees == len(sliced.get_dump())
+
+        with pytest.raises(ValueError, match=r'>= 0'):
+            booster[-1: 0]
+
+        # we do not accept empty slice.
+        with pytest.raises(ValueError):
+            booster[1:1]
+        # stop can not be smaller than begin
+        with pytest.raises(ValueError, match=r'Invalid.*'):
+            booster[3:0]
+        with pytest.raises(ValueError, match=r'Invalid.*'):
+            booster[3:-1]
+        # negative step is not supported.
+        with pytest.raises(ValueError, match=r'.*>= 1.*'):
+            booster[0:2:-1]
+        # step can not be 0.
+        with pytest.raises(ValueError, match=r'.*>= 1.*'):
+            booster[0:2:0]
+
+        trees = [_ for _ in booster]
+        assert len(trees) == num_boost_round
+
+        with pytest.raises(TypeError):
+            booster["wrong type"]
+        with pytest.raises(IndexError):
+            booster[:num_boost_round+1]
+        with pytest.raises(ValueError):
+            booster[1, 2]       # too many dims
+        # setitem is not implemented as model is immutable during slicing.
+        with pytest.raises(TypeError):
+            booster[...:end] = booster
+
+        sliced_0 = booster[1:3]
+        sliced_1 = booster[3:7]
+
+        predt_0 = sliced_0.predict(dtrain, output_margin=True)
+        predt_1 = sliced_1.predict(dtrain, output_margin=True)
+
+        merged = predt_0 + predt_1 - 0.5  # base score.
+        single = booster[1:7].predict(dtrain, output_margin=True)
+        np.testing.assert_allclose(merged, single, atol=1e-6)
+
+        sliced_0 = booster[1:7:2]  # 1,3,5
+        sliced_1 = booster[2:8:2]  # 2,4,6
+
+        predt_0 = sliced_0.predict(dtrain, output_margin=True)
+        predt_1 = sliced_1.predict(dtrain, output_margin=True)
+
+        merged = predt_0 + predt_1 - 0.5
+        single = booster[1:7].predict(dtrain, output_margin=True)
+        np.testing.assert_allclose(merged, single, atol=1e-6)
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -113,6 +113,35 @@ class TestCallbacks(unittest.TestCase):
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

+    def test_early_stopping_save_best_model(self):
+        from sklearn.datasets import load_breast_cancer
+        X, y = load_breast_cancer(return_X_y=True)
+        n_estimators = 100
+        cls = xgb.XGBClassifier(n_estimators=n_estimators)
+        early_stopping_rounds = 5
+        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
+                                                save_best=True)
+        cls.fit(X, y, eval_set=[(X, y)],
+                eval_metric=tm.eval_error_metric, callbacks=[early_stop])
+        booster = cls.get_booster()
+        dump = booster.get_dump(dump_format='json')
+        assert len(dump) == booster.best_iteration
+
+        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
+                                                save_best=True)
+        cls = xgb.XGBClassifier(booster='gblinear', n_estimators=10)
+        self.assertRaises(ValueError, lambda: cls.fit(X, y, eval_set=[(X, y)],
+                                                      eval_metric=tm.eval_error_metric,
+                                                      callbacks=[early_stop]))
+
+        # No error
+        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
+                                                save_best=False)
+        xgb.XGBClassifier(booster='gblinear', n_estimators=10).fit(
+            X, y, eval_set=[(X, y)],
+            eval_metric=tm.eval_error_metric,
+            callbacks=[early_stop])
+
    def run_eta_decay(self, tree_method, deprecated_callback):
        if deprecated_callback:
            scheduler = xgb.callback.reset_learning_rate