Bound the size of the histogram cache. (#9440)

- Add a new histogram collection with a bound on its size.
- Unify the histogram building logic between hist, multi-target hist, and approx.
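The bounded cache itself lives in the C++ histogram builder; the hunks below only cover the Python test changes. As a rough illustration of the idea (class and method names here are invented for the sketch, not XGBoost's actual implementation), the collection keeps at most a fixed number of per-node histograms alive and recycles evicted buffers instead of growing with the number of tree nodes:

```python
from typing import Dict, List

import numpy as np


class BoundedHistCache:
    """Illustrative sketch of a size-bounded per-node histogram cache."""

    def __init__(self, n_bins: int, limit: int) -> None:
        self.n_bins = n_bins
        self.limit = limit  # maximum number of node histograms kept alive
        self.node_map: Dict[int, np.ndarray] = {}  # node id -> histogram
        self.free_list: List[np.ndarray] = []  # recycled buffers

    def get(self, nidx: int) -> np.ndarray:
        """Fetch the histogram for node `nidx`, allocating or recycling."""
        if nidx in self.node_map:
            return self.node_map[nidx]
        if len(self.node_map) >= self.limit:
            # Cache is full: evict a cached node and reuse its buffer.
            # An evicted histogram is rebuilt if it is ever needed again.
            _, buf = self.node_map.popitem()
        elif self.free_list:
            buf = self.free_list.pop()
        else:
            # Real histograms hold gradient/Hessian pairs per bin; a flat
            # array is enough to show the bookkeeping.
            buf = np.empty(self.n_bins, dtype=np.float64)
        buf.fill(0.0)
        self.node_map[nidx] = buf
        return buf

    def release(self, nidx: int) -> None:
        """Hand a finished node's buffer back to the free list."""
        buf = self.node_map.pop(nidx, None)
        if buf is not None:
            self.free_list.append(buf)
```

With a hard bound like this, histogram memory no longer grows with tree depth; the trade-off is recomputing histograms for nodes that were evicted.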
Author:    Jiaming Yuan
Date:      2023-08-08 03:21:26 +08:00
Committer: GitHub
Parent:    5bd163aa25
Commit:    54029a59af

27 changed files with 994 additions and 565 deletions


@@ -16,6 +16,7 @@ from xgboost.testing import (
     predictor_equal,
 )
 from xgboost.testing.data import check_inf, np_dtypes
+from xgboost.testing.data_iter import run_mixed_sparsity


 class TestQuantileDMatrix:
@@ -334,3 +335,6 @@ class TestQuantileDMatrix:
         with pytest.raises(ValueError, match="consistent"):
             xgb.train({}, Xy, num_boost_round=2, xgb_model=booster)
+
+    def test_mixed_sparsity(self) -> None:
+        run_mixed_sparsity("cpu")
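`run_mixed_sparsity` is a shared helper in `xgboost.testing.data_iter`, and its body is not part of this diff. Going only by the name, it presumably feeds `QuantileDMatrix` an iterator whose batches alternate between dense and sparse inputs. A self-contained guess at that shape, using only the public `DataIter` API:

```python
import numpy as np
import scipy.sparse
import xgboost as xgb


class MixedSparsityIter(xgb.DataIter):
    """Yield one dense and one sparse batch to QuantileDMatrix."""

    def __init__(self) -> None:
        rng = np.random.default_rng(0)
        dense = rng.normal(size=(64, 4))
        sparse = scipy.sparse.random(64, 4, density=0.2, format="csr", random_state=0)
        self._batches = [dense, sparse]
        self._labels = [rng.normal(size=64), rng.normal(size=64)]
        self._it = 0
        super().__init__()

    def next(self, input_data) -> bool:
        if self._it == len(self._batches):
            return False  # no more batches
        input_data(data=self._batches[self._it], label=self._labels[self._it])
        self._it += 1
        return True

    def reset(self) -> None:
        self._it = 0


Xy = xgb.QuantileDMatrix(MixedSparsityIter())
xgb.train({"tree_method": "hist"}, Xy, num_boost_round=2)
```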


@@ -11,6 +11,7 @@ from xgboost import testing as tm
 from xgboost.testing.params import (
     cat_parameter_strategy,
     exact_parameter_strategy,
+    hist_cache_strategy,
     hist_multi_parameter_strategy,
     hist_parameter_strategy,
 )
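The new `hist_cache_strategy` import comes from `xgboost/testing/params.py`, which is not shown in this excerpt. Judging from how it is consumed below (`param.update(cache_param)`), it is presumably a Hypothesis strategy yielding a parameter dict over the cache bound. A sketch under that assumption; the parameter name `max_cached_hist_node` and the sampled values are assumptions, not taken from this diff:

```python
from hypothesis import strategies

# Assumed definition: draw a cache bound ranging from "evict constantly"
# (1) to "effectively unbounded" (2**31).
hist_cache_strategy = strategies.fixed_dictionaries(
    {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
)
```

Because the strategy yields a plain dict, the tests can merge it into the training parameters exactly like the existing `hist_param`.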
@@ -40,14 +41,22 @@ class TestTreeMethodMulti:
     @given(
         exact_parameter_strategy,
         hist_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.multi_dataset_strategy,
     )
     @settings(deadline=None, print_blob=True)
-    def test_approx(self, param, hist_param, num_rounds, dataset):
+    def test_approx(
+        self, param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
+    ) -> None:
         param["tree_method"] = "approx"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
@@ -55,18 +64,25 @@ class TestTreeMethodMulti:
     @given(
         exact_parameter_strategy,
         hist_multi_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.multi_dataset_strategy,
     )
     @settings(deadline=None, print_blob=True)
     def test_hist(
-        self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset
+        self,
+        param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
     ) -> None:
         if dataset.name.endswith("-l1"):
             return
         param["tree_method"] = "hist"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
@@ -91,14 +107,23 @@ class TestTreeMethod:
     @given(
         exact_parameter_strategy,
         hist_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.make_dataset_strategy(),
     )
     @settings(deadline=None, print_blob=True)
-    def test_approx(self, param, hist_param, num_rounds, dataset):
+    def test_approx(
+        self,
+        param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
+    ) -> None:
         param["tree_method"] = "approx"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
@@ -130,17 +155,25 @@ class TestTreeMethod:
     @given(
         exact_parameter_strategy,
         hist_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.make_dataset_strategy()
     )
     @settings(deadline=None, print_blob=True)
-    def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
-        param['tree_method'] = 'hist'
+    def test_hist(
+        self, param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
+    ) -> None:
+        param["tree_method"] = "hist"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
-        assert tm.non_increasing(result['train'][dataset.metric])
+        assert tm.non_increasing(result["train"][dataset.metric])

     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
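Outside the Hypothesis harness, the cache bound is just another training parameter. A minimal usage sketch, again assuming the parameter is named `max_cached_hist_node`; since this commit unifies histogram building between hist, multi-target hist, and approx, the same knob should apply to both tree methods:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(512, 16))
y = X @ rng.normal(size=16)
Xy = xgb.DMatrix(X, label=y)

for tree_method in ("hist", "approx"):
    # A small bound forces the builder to recycle histogram buffers
    # instead of keeping one histogram per tree node.
    xgb.train(
        {"tree_method": tree_method, "max_cached_hist_node": 64},
        Xy,
        num_boost_round=4,
    )
```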