[EM] Have one partitioner for each batch. (#10760)

- Initialize one partitioner for each batch.
- Collect partition size during initialization.
- Support a base row index (ridx) in the finalization.
This commit is contained in:
Jiaming Yuan
2024-08-29 01:35:17 +08:00
committed by GitHub
parent 3043827efc
commit 4fe67f10b4
10 changed files with 211 additions and 181 deletions

View File

@@ -17,7 +17,7 @@ from xgboost.testing.updater import check_quantile_loss_extmem
 pytestmark = tm.timeout(30)
-def test_single_batch(tree_method: str = "approx") -> None:
+def test_single_batch(tree_method: str = "approx", device: str = "cpu") -> None:
 from sklearn.datasets import load_breast_cancer
 n_rounds = 10
@@ -25,17 +25,19 @@ def test_single_batch(tree_method: str = "approx") -> None:
 X = X.astype(np.float32)
 y = y.astype(np.float32)
+params = {"tree_method": tree_method, "device": device}
 Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
-from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+from_it = xgb.train(params, Xy, num_boost_round=n_rounds)
 Xy = xgb.DMatrix(X, y)
-from_dmat = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+from_dmat = xgb.train(params, Xy, num_boost_round=n_rounds)
 assert from_it.get_dump() == from_dmat.get_dump()
 X, y = load_breast_cancer(return_X_y=True, as_frame=True)
 X = X.astype(np.float32)
 Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
-from_pd = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+from_pd = xgb.train(params, Xy, num_boost_round=n_rounds)
 # remove feature info to generate exact same text representation.
 from_pd.feature_names = None
 from_pd.feature_types = None
@@ -45,11 +47,11 @@ def test_single_batch(tree_method: str = "approx") -> None:
 X, y = load_breast_cancer(return_X_y=True)
 X = csr_matrix(X)
 Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
-from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+from_it = xgb.train(params, Xy, num_boost_round=n_rounds)
 X, y = load_breast_cancer(return_X_y=True)
 Xy = xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0)
-from_np = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+from_np = xgb.train(params, Xy, num_boost_round=n_rounds)
 assert from_np.get_dump() == from_it.get_dump()