[EM] Have one partitioner for each batch. (#10760)
- Initialize one partitioner for each batch.
- Collect partition size during initialization.
- Support base ridx in the finalization.
This commit is contained in:
@@ -38,11 +38,9 @@ void VerifySampling(size_t page_size, float subsample, int sampling_method,
|
||||
auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
|
||||
|
||||
if (fixed_size_sampling) {
|
||||
EXPECT_EQ(sample.sample_rows, kRows);
|
||||
EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows);
|
||||
EXPECT_EQ(sample.gpair.size(), kRows);
|
||||
} else {
|
||||
EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.03);
|
||||
EXPECT_NEAR(sample.p_fmat->Info().num_row_, sample_rows, kRows * 0.03f);
|
||||
EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.03f);
|
||||
}
|
||||
@@ -89,7 +87,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
|
||||
GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
|
||||
auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
|
||||
auto p_fmat = sample.p_fmat;
|
||||
EXPECT_EQ(sample.sample_rows, kRows);
|
||||
EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows);
|
||||
EXPECT_EQ(sample.gpair.size(), gpair.Size());
|
||||
EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
|
||||
EXPECT_EQ(p_fmat->Info().num_row_, kRows);
|
||||
|
||||
@@ -12,7 +12,7 @@ from test_data_iterator import test_single_batch as cpu_single_batch
|
||||
|
||||
|
||||
def test_gpu_single_batch() -> None:
|
||||
cpu_single_batch("gpu_hist")
|
||||
cpu_single_batch("hist", "cuda")
|
||||
|
||||
|
||||
@pytest.mark.skipif(**no_cupy())
|
||||
|
||||
@@ -17,7 +17,7 @@ from xgboost.testing.updater import check_quantile_loss_extmem
|
||||
pytestmark = tm.timeout(30)
|
||||
|
||||
|
||||
def test_single_batch(tree_method: str = "approx") -> None:
|
||||
def test_single_batch(tree_method: str = "approx", device: str = "cpu") -> None:
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
n_rounds = 10
|
||||
@@ -25,17 +25,19 @@ def test_single_batch(tree_method: str = "approx") -> None:
|
||||
X = X.astype(np.float32)
|
||||
y = y.astype(np.float32)
|
||||
|
||||
params = {"tree_method": tree_method, "device": device}
|
||||
|
||||
Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
|
||||
from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
|
||||
from_it = xgb.train(params, Xy, num_boost_round=n_rounds)
|
||||
|
||||
Xy = xgb.DMatrix(X, y)
|
||||
from_dmat = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
|
||||
from_dmat = xgb.train(params, Xy, num_boost_round=n_rounds)
|
||||
assert from_it.get_dump() == from_dmat.get_dump()
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
|
||||
X = X.astype(np.float32)
|
||||
Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
|
||||
from_pd = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
|
||||
from_pd = xgb.train(params, Xy, num_boost_round=n_rounds)
|
||||
# remove feature info to generate exact same text representation.
|
||||
from_pd.feature_names = None
|
||||
from_pd.feature_types = None
|
||||
@@ -45,11 +47,11 @@ def test_single_batch(tree_method: str = "approx") -> None:
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X = csr_matrix(X)
|
||||
Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
|
||||
from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
|
||||
from_it = xgb.train(params, Xy, num_boost_round=n_rounds)
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
Xy = xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0)
|
||||
from_np = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
|
||||
from_np = xgb.train(params, Xy, num_boost_round=n_rounds)
|
||||
assert from_np.get_dump() == from_it.get_dump()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user