[EM] Have one partitioner for each batch. (#10760)

- Initialize one partitioner for each batch.
- Collect partition size during initialization.
- Support base ridx in the finalization.
2024-08-29 01:35:17 +08:00
parent 3043827efc
commit 4fe67f10b4
10 changed files with 211 additions and 181 deletions
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -38,11 +38,9 @@ void VerifySampling(size_t page_size, float subsample, int sampling_method,
  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());

  if (fixed_size_sampling) {
-    EXPECT_EQ(sample.sample_rows, kRows);
    EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows);
    EXPECT_EQ(sample.gpair.size(), kRows);
  } else {
-    EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.03);
    EXPECT_NEAR(sample.p_fmat->Info().num_row_, sample_rows, kRows * 0.03f);
    EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.03f);
  }
@@ -89,7 +87,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
  GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
  auto p_fmat = sample.p_fmat;
-  EXPECT_EQ(sample.sample_rows, kRows);
+  EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows);
  EXPECT_EQ(sample.gpair.size(), gpair.Size());
  EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
  EXPECT_EQ(p_fmat->Info().num_row_, kRows);
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -12,7 +12,7 @@ from test_data_iterator import test_single_batch as cpu_single_batch


 def test_gpu_single_batch() -> None:
-    cpu_single_batch("gpu_hist")
+    cpu_single_batch("hist", "cuda")


@pytest.mark.skipif(**no_cupy())
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -17,7 +17,7 @@ from xgboost.testing.updater import check_quantile_loss_extmem
 pytestmark = tm.timeout(30)


-def test_single_batch(tree_method: str = "approx") -> None:
+def test_single_batch(tree_method: str = "approx", device: str = "cpu") -> None:
    from sklearn.datasets import load_breast_cancer

    n_rounds = 10
@@ -25,17 +25,19 @@ def test_single_batch(tree_method: str = "approx") -> None:
    X = X.astype(np.float32)
    y = y.astype(np.float32)

+    params = {"tree_method": tree_method, "device": device}
+
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
-    from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    from_it = xgb.train(params, Xy, num_boost_round=n_rounds)

    Xy = xgb.DMatrix(X, y)
-    from_dmat = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    from_dmat = xgb.train(params, Xy, num_boost_round=n_rounds)
    assert from_it.get_dump() == from_dmat.get_dump()

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X = X.astype(np.float32)
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
-    from_pd = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    from_pd = xgb.train(params, Xy, num_boost_round=n_rounds)
    # remove feature info to generate exact same text representation.
    from_pd.feature_names = None
    from_pd.feature_types = None
@@ -45,11 +47,11 @@ def test_single_batch(tree_method: str = "approx") -> None:
    X, y = load_breast_cancer(return_X_y=True)
    X = csr_matrix(X)
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
-    from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    from_it = xgb.train(params, Xy, num_boost_round=n_rounds)

    X, y = load_breast_cancer(return_X_y=True)
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0)
-    from_np = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    from_np = xgb.train(params, Xy, num_boost_round=n_rounds)
    assert from_np.get_dump() == from_it.get_dump()