Bound the size of the histogram cache. (#9440)

- Add a new histogram collection with a bound on its size.
- Unify the histogram building logic between hist, multi-target hist, and approx.
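The bounded cache itself lives in the C++ histogram builder; the hunks below only cover the Python test changes. As a rough illustration of the idea (class and method names here are invented for the sketch, not XGBoost's actual implementation), the collection keeps at most a fixed number of per-node histograms alive and recycles evicted buffers instead of growing with the number of tree nodes:

```python
from typing import Dict, List

import numpy as np


class BoundedHistCache:
    """Illustrative sketch of a size-bounded per-node histogram cache."""

    def __init__(self, n_bins: int, limit: int) -> None:
        self.n_bins = n_bins
        self.limit = limit  # maximum number of node histograms kept alive
        self.node_map: Dict[int, np.ndarray] = {}  # node id -> histogram
        self.free_list: List[np.ndarray] = []  # recycled buffers

    def get(self, nidx: int) -> np.ndarray:
        """Fetch the histogram for node `nidx`, allocating or recycling."""
        if nidx in self.node_map:
            return self.node_map[nidx]
        if len(self.node_map) >= self.limit:
            # Cache is full: evict a cached node and reuse its buffer.
            # An evicted histogram is rebuilt if it is ever needed again.
            _, buf = self.node_map.popitem()
        elif self.free_list:
            buf = self.free_list.pop()
        else:
            # Real histograms hold gradient/Hessian pairs per bin; a flat
            # array is enough to show the bookkeeping.
            buf = np.empty(self.n_bins, dtype=np.float64)
        buf.fill(0.0)
        self.node_map[nidx] = buf
        return buf

    def release(self, nidx: int) -> None:
        """Hand a finished node's buffer back to the free list."""
        buf = self.node_map.pop(nidx, None)
        if buf is not None:
            self.free_list.append(buf)
```

With a hard bound like this, histogram memory no longer grows with tree depth; the trade-off is recomputing histograms for nodes that were evicted.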
Author:    Jiaming Yuan
Date:      2023-08-08 03:21:26 +08:00
Committer: GitHub
Parent:    5bd163aa25
Commit:    54029a59af

27 changed files with 994 additions and 565 deletions


@@ -16,6 +16,7 @@ from xgboost.testing import (
     predictor_equal,
 )
 from xgboost.testing.data import check_inf, np_dtypes
+from xgboost.testing.data_iter import run_mixed_sparsity


 class TestQuantileDMatrix:
@@ -334,3 +335,6 @@ class TestQuantileDMatrix:
         with pytest.raises(ValueError, match="consistent"):
             xgb.train({}, Xy, num_boost_round=2, xgb_model=booster)
+
+    def test_mixed_sparsity(self) -> None:
+        run_mixed_sparsity("cpu")
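`run_mixed_sparsity` is a shared helper in `xgboost.testing.data_iter`, and its body is not part of this diff. Going only by the name, it presumably feeds `QuantileDMatrix` an iterator whose batches alternate between dense and sparse inputs. A self-contained guess at that shape, using only the public `DataIter` API:

```python
import numpy as np
import scipy.sparse
import xgboost as xgb


class MixedSparsityIter(xgb.DataIter):
    """Yield one dense and one sparse batch to QuantileDMatrix."""

    def __init__(self) -> None:
        rng = np.random.default_rng(0)
        dense = rng.normal(size=(64, 4))
        sparse = scipy.sparse.random(64, 4, density=0.2, format="csr", random_state=0)
        self._batches = [dense, sparse]
        self._labels = [rng.normal(size=64), rng.normal(size=64)]
        self._it = 0
        super().__init__()

    def next(self, input_data) -> bool:
        if self._it == len(self._batches):
            return False  # no more batches
        input_data(data=self._batches[self._it], label=self._labels[self._it])
        self._it += 1
        return True

    def reset(self) -> None:
        self._it = 0


Xy = xgb.QuantileDMatrix(MixedSparsityIter())
xgb.train({"tree_method": "hist"}, Xy, num_boost_round=2)
```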


@@ -11,6 +11,7 @@ from xgboost import testing as tm
 from xgboost.testing.params import (
     cat_parameter_strategy,
     exact_parameter_strategy,
+    hist_cache_strategy,
     hist_multi_parameter_strategy,
     hist_parameter_strategy,
 )
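The new `hist_cache_strategy` import comes from `xgboost/testing/params.py`, which is not shown in this excerpt. Judging from how it is consumed below (`param.update(cache_param)`), it is presumably a Hypothesis strategy yielding a parameter dict over the cache bound. A sketch under that assumption; the parameter name `max_cached_hist_node` and the sampled values are assumptions, not taken from this diff:

```python
from hypothesis import strategies

# Assumed definition: draw a cache bound ranging from "evict constantly"
# (1) to "effectively unbounded" (2**31).
hist_cache_strategy = strategies.fixed_dictionaries(
    {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
)
```

Because the strategy yields a plain dict, the tests can merge it into the training parameters exactly like the existing `hist_param`.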
@@ -40,14 +41,22 @@ class TestTreeMethodMulti:
     @given(
         exact_parameter_strategy,
         hist_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.multi_dataset_strategy,
     )
     @settings(deadline=None, print_blob=True)
-    def test_approx(self, param, hist_param, num_rounds, dataset):
+    def test_approx(
+        self, param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
+    ) -> None:
         param["tree_method"] = "approx"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
@@ -55,18 +64,25 @@ class TestTreeMethodMulti:
     @given(
         exact_parameter_strategy,
         hist_multi_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.multi_dataset_strategy,
     )
     @settings(deadline=None, print_blob=True)
     def test_hist(
-        self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset
+        self,
+        param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
     ) -> None:
         if dataset.name.endswith("-l1"):
             return
         param["tree_method"] = "hist"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
@@ -91,14 +107,23 @@ class TestTreeMethod:
     @given(
         exact_parameter_strategy,
         hist_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.make_dataset_strategy(),
     )
     @settings(deadline=None, print_blob=True)
-    def test_approx(self, param, hist_param, num_rounds, dataset):
+    def test_approx(
+        self,
+        param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
+    ) -> None:
         param["tree_method"] = "approx"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
@@ -130,17 +155,25 @@ class TestTreeMethod:
     @given(
         exact_parameter_strategy,
         hist_parameter_strategy,
+        hist_cache_strategy,
         strategies.integers(1, 20),
         tm.make_dataset_strategy()
     )
     @settings(deadline=None, print_blob=True)
-    def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
-        param['tree_method'] = 'hist'
+    def test_hist(
+        self, param: Dict[str, Any],
+        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
+        num_rounds: int,
+        dataset: tm.TestDataset,
+    ) -> None:
+        param["tree_method"] = "hist"
         param = dataset.set_params(param)
         param.update(hist_param)
+        param.update(cache_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
-        assert tm.non_increasing(result['train'][dataset.metric])
+        assert tm.non_increasing(result["train"][dataset.metric])

     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
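Outside the Hypothesis harness, the cache bound is just another training parameter. A minimal usage sketch, again assuming the parameter is named `max_cached_hist_node`; since this commit unifies histogram building between hist, multi-target hist, and approx, the same knob should apply to both tree methods:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(512, 16))
y = X @ rng.normal(size=16)
Xy = xgb.DMatrix(X, label=y)

for tree_method in ("hist", "approx"):
    # A small bound forces the builder to recycle histogram buffers
    # instead of keeping one histogram per tree node.
    xgb.train(
        {"tree_method": tree_method, "max_cached_hist_node": 64},
        Xy,
        num_boost_round=4,
    )
```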