Move num_parallel_tree to model parameter. (#7751)

The size of the forest should be a property of the model itself instead of a training hyper-parameter.
2022-03-29 02:32:42 +08:00
parent 8b3ecfca25
commit 3c9b04460a
11 changed files with 158 additions and 101 deletions
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -8,8 +8,6 @@ import locale
 import tempfile

 dpath = os.path.join(tm.PROJECT_ROOT, 'demo/data/')
-dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')

 rng = np.random.RandomState(1994)

@@ -38,6 +36,8 @@ class TestModels:
        param = {'verbosity': 0, 'objective': 'binary:logistic',
                 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
                 'nthread': 1}
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 4
        bst = xgb.train(param, dtrain, num_round, watchlist)
@@ -124,7 +124,7 @@ class TestModels:
        predt_1 = bst.predict(margined)

        assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
-
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
        predt_2 = bst.predict(dtrain)
        assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
@@ -150,6 +150,8 @@ class TestModels:
            'objective': 'reg:logistic',
            "tree_method": tree_method
        }
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 10

@@ -195,6 +197,8 @@ class TestModels:
        self.run_custom_objective()

    def test_multi_eval_metric(self):
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
                 'objective': 'binary:logistic'}
@@ -216,6 +220,7 @@ class TestModels:
            param['scale_pos_weight'] = ratio
            return (dtrain, dtest, param)

+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        xgb.cv(param, dtrain, num_round, nfold=5,
               metrics={'auc'}, seed=0, fpreproc=fpreproc)

@@ -223,6 +228,7 @@ class TestModels:
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic'}
        num_round = 2
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        xgb.cv(param, dtrain, num_round, nfold=5,
               metrics={'error'}, seed=0, show_stdv=False)

@@ -331,6 +337,7 @@ class TestModels:
        os.remove(model_path)

        try:
+            dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
            xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
        except ValueError as e:
            e_str = str(e)
@@ -422,68 +429,58 @@ class TestModels:
            assert cls.get_booster().best_ntree_limit == 2
            assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

-    @pytest.mark.skipif(**tm.no_sklearn())
-    @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
-    def test_slice(self, booster):
-        from sklearn.datasets import make_classification
-        num_classes = 3
-        X, y = make_classification(n_samples=1000, n_informative=5,
-                                   n_classes=num_classes)
-        dtrain = xgb.DMatrix(data=X, label=y)
-        num_parallel_tree = 4
-        num_boost_round = 16
-        total_trees = num_parallel_tree * num_classes * num_boost_round
-        booster = xgb.train({
-            'num_parallel_tree': 4, 'subsample': 0.5, 'num_class': 3, 'booster': booster,
-            'objective': 'multi:softprob'},
-                            num_boost_round=num_boost_round, dtrain=dtrain)
-        booster.feature_types = ["q"] * X.shape[1]
-
-        assert len(booster.get_dump()) == total_trees
+    def run_slice(
+        self,
+        booster: xgb.Booster,
+        dtrain: xgb.DMatrix,
+        num_parallel_tree: int,
+        num_classes: int,
+        num_boost_round: int
+    ):
        beg = 3
        end = 7
-        sliced: xgb.Booster = booster[beg: end]
+        sliced: xgb.Booster = booster[beg:end]
        assert sliced.feature_types == booster.feature_types

        sliced_trees = (end - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced_trees = sliced_trees // 2
-        sliced: xgb.Booster = booster[beg: end: 2]
+        sliced = booster[beg:end:2]
        assert sliced_trees == len(sliced.get_dump())

-        sliced: xgb.Booster = booster[beg: ...]
+        sliced = booster[beg: ...]
        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

-        sliced: xgb.Booster = booster[beg:]
+        sliced = booster[beg:]
        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

-        sliced: xgb.Booster = booster[:end]
+        sliced = booster[:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

-        sliced: xgb.Booster = booster[...:end]
+        sliced = booster[...: end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

-        with pytest.raises(ValueError, match=r'>= 0'):
-            booster[-1: 0]
+        with pytest.raises(ValueError, match=r">= 0"):
+            booster[-1:0]

        # we do not accept empty slice.
        with pytest.raises(ValueError):
            booster[1:1]
        # stop can not be smaller than begin
-        with pytest.raises(ValueError, match=r'Invalid.*'):
+        with pytest.raises(ValueError, match=r"Invalid.*"):
            booster[3:0]
-        with pytest.raises(ValueError, match=r'Invalid.*'):
+        with pytest.raises(ValueError, match=r"Invalid.*"):
            booster[3:-1]
        # negative step is not supported.
-        with pytest.raises(ValueError, match=r'.*>= 1.*'):
+        with pytest.raises(ValueError, match=r".*>= 1.*"):
            booster[0:2:-1]
        # step can not be 0.
-        with pytest.raises(ValueError, match=r'.*>= 1.*'):
+        with pytest.raises(ValueError, match=r".*>= 1.*"):
            booster[0:2:0]

        trees = [_ for _ in booster]
@@ -492,12 +489,12 @@ class TestModels:
        with pytest.raises(TypeError):
            booster["wrong type"]
        with pytest.raises(IndexError):
-            booster[:num_boost_round+1]
+            booster[: num_boost_round + 1]
        with pytest.raises(ValueError):
-            booster[1, 2]       # too many dims
+            booster[1, 2]  # too many dims
        # setitem is not implemented as model is immutable during slicing.
        with pytest.raises(TypeError):
-            booster[...:end] = booster
+            booster[...: end] = booster

        sliced_0 = booster[1:3]
        np.testing.assert_allclose(
@@ -525,6 +522,44 @@ class TestModels:
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

+    @pytest.mark.skipif(**tm.no_sklearn())
+    @pytest.mark.parametrize("booster", ["gbtree", "dart"])
+    def test_slice(self, booster):
+        from sklearn.datasets import make_classification
+
+        num_classes = 3
+        X, y = make_classification(
+            n_samples=1000, n_informative=5, n_classes=num_classes
+        )
+        dtrain = xgb.DMatrix(data=X, label=y)
+        num_parallel_tree = 4
+        num_boost_round = 16
+        total_trees = num_parallel_tree * num_classes * num_boost_round
+        booster = xgb.train(
+            {
+                "num_parallel_tree": num_parallel_tree,
+                "subsample": 0.5,
+                "num_class": num_classes,
+                "booster": booster,
+                "objective": "multi:softprob",
+            },
+            num_boost_round=num_boost_round,
+            dtrain=dtrain,
+        )
+        booster.feature_types = ["q"] * X.shape[1]
+
+        assert len(booster.get_dump()) == total_trees
+
+        self.run_slice(booster, dtrain, num_parallel_tree, num_classes, num_boost_round)
+
+        bytesarray = booster.save_raw(raw_format="ubj")
+        booster = xgb.Booster(model_file=bytesarray)
+        self.run_slice(booster, dtrain, num_parallel_tree, num_classes, num_boost_round)
+
+        bytesarray = booster.save_raw(raw_format="deprecated")
+        booster = xgb.Booster(model_file=bytesarray)
+        self.run_slice(booster, dtrain, num_parallel_tree, num_classes, num_boost_round)
+
    @pytest.mark.skipif(**tm.no_pandas())
    def test_feature_info(self):
        import pandas as pd
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -530,7 +530,7 @@ def test_dask_regressor(model: str, client: "Client") -> None:
    forest = int(
        json.loads(regressor.get_booster().save_config())["learner"][
            "gradient_booster"
-        ]["gbtree_train_param"]["num_parallel_tree"]
+        ]["gbtree_model_param"]["num_parallel_tree"]
    )

    if model == "boosting":
@@ -584,7 +584,7 @@ def run_dask_classifier(
    assert n_threads != 0 and n_threads != os.cpu_count()

    forest = int(
-        config["learner"]["gradient_booster"]["gbtree_train_param"]["num_parallel_tree"]
+        config["learner"]["gradient_booster"]["gbtree_model_param"]["num_parallel_tree"]
    )
    if model == "boosting":
        assert len(history["validation_0"][metric]) == 2
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -329,21 +329,27 @@ def test_select_feature():

 def test_num_parallel_tree():
    from sklearn.datasets import fetch_california_housing
-    reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4,
-                           tree_method='hist')
+
+    reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method="hist")
    X, y = fetch_california_housing(return_X_y=True)
    bst = reg.fit(X=X, y=y)
-    dump = bst.get_booster().get_dump(dump_format='json')
+    dump = bst.get_booster().get_dump(dump_format="json")
    assert len(dump) == 16

    reg = xgb.XGBRFRegressor(n_estimators=4)
    bst = reg.fit(X=X, y=y)
-    dump = bst.get_booster().get_dump(dump_format='json')
+    dump = bst.get_booster().get_dump(dump_format="json")
    assert len(dump) == 4

    config = json.loads(bst.get_booster().save_config())
-    assert int(config['learner']['gradient_booster']['gbtree_train_param'][
-        'num_parallel_tree']) == 4
+    assert (
+        int(
+            config["learner"]["gradient_booster"]["gbtree_model_param"][
+                "num_parallel_tree"
+            ]
+        )
+        == 4
+    )


 def test_calif_housing_regression():