[EM] Have one partitioner for each batch. (#10760)

- Initialize one partitioner for each batch.
- Collect partition size during initialization.
- Support base ridx (base row index) in the finalization.
This commit is contained in:
Jiaming Yuan
2024-08-29 01:35:17 +08:00
committed by GitHub
parent 3043827efc
commit 4fe67f10b4
10 changed files with 211 additions and 181 deletions

View File

@@ -38,11 +38,9 @@ void VerifySampling(size_t page_size, float subsample, int sampling_method,
auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
if (fixed_size_sampling) {
EXPECT_EQ(sample.sample_rows, kRows);
EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows);
EXPECT_EQ(sample.gpair.size(), kRows);
} else {
EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.03);
EXPECT_NEAR(sample.p_fmat->Info().num_row_, sample_rows, kRows * 0.03f);
EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.03f);
}
@@ -89,7 +87,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
auto p_fmat = sample.p_fmat;
EXPECT_EQ(sample.sample_rows, kRows);
EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows);
EXPECT_EQ(sample.gpair.size(), gpair.Size());
EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
EXPECT_EQ(p_fmat->Info().num_row_, kRows);

View File

@@ -12,7 +12,7 @@ from test_data_iterator import test_single_batch as cpu_single_batch
def test_gpu_single_batch() -> None:
cpu_single_batch("gpu_hist")
cpu_single_batch("hist", "cuda")
@pytest.mark.skipif(**no_cupy())

View File

@@ -17,7 +17,7 @@ from xgboost.testing.updater import check_quantile_loss_extmem
pytestmark = tm.timeout(30)
def test_single_batch(tree_method: str = "approx") -> None:
def test_single_batch(tree_method: str = "approx", device: str = "cpu") -> None:
from sklearn.datasets import load_breast_cancer
n_rounds = 10
@@ -25,17 +25,19 @@ def test_single_batch(tree_method: str = "approx") -> None:
X = X.astype(np.float32)
y = y.astype(np.float32)
params = {"tree_method": tree_method, "device": device}
Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
from_it = xgb.train(params, Xy, num_boost_round=n_rounds)
Xy = xgb.DMatrix(X, y)
from_dmat = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
from_dmat = xgb.train(params, Xy, num_boost_round=n_rounds)
assert from_it.get_dump() == from_dmat.get_dump()
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X = X.astype(np.float32)
Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
from_pd = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
from_pd = xgb.train(params, Xy, num_boost_round=n_rounds)
# remove feature info to generate exact same text representation.
from_pd.feature_names = None
from_pd.feature_types = None
@@ -45,11 +47,11 @@ def test_single_batch(tree_method: str = "approx") -> None:
X, y = load_breast_cancer(return_X_y=True)
X = csr_matrix(X)
Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
from_it = xgb.train(params, Xy, num_boost_round=n_rounds)
X, y = load_breast_cancer(return_X_y=True)
Xy = xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0)
from_np = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
from_np = xgb.train(params, Xy, num_boost_round=n_rounds)
assert from_np.get_dump() == from_it.get_dump()