[EM] Make page concatenation optional. (#10826)

This PR introduces a new parameter, `extmem_concat_pages`, to make page concatenation optional for GPU hist. In addition, the documentation is updated for the new GPU-based external memory.
2024-09-24 06:19:28 +08:00
parent 215da76263
commit e228c1a121
31 changed files with 690 additions and 388 deletions
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -496,7 +496,7 @@ auto MakeExtMemForTest(bst_idx_t n_samples, bst_feature_t n_features, Json dconf

  NumpyArrayIterForTest iter_1{0.0f, n_samples, n_features, n_batches};
  auto Xy = std::make_shared<data::SparsePageDMatrix>(
-      &iter_1, iter_1.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, "");
+      &iter_1, iter_1.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, "", false);
  MakeLabelForTest(Xy, p_fmat);
  return std::pair{p_fmat, Xy};
 }
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -37,7 +37,8 @@ void TestSparseDMatrixLoadFile(Context const* ctx) {
                            data::fileiter::Next,
                            std::numeric_limits<float>::quiet_NaN(),
                            n_threads,
-                            tmpdir.path + "cache"};
+                            tmpdir.path + "cache",
+                            false};
  ASSERT_EQ(AllThreadsForTest(), m.Ctx()->Threads());
  ASSERT_EQ(m.Info().num_col_, 5);
  ASSERT_EQ(m.Info().num_row_, 64);
@@ -364,9 +365,9 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
  CreateBigTestData(filename, 1 << 16);

  data::FileIterator iter(filename + "?format=libsvm", 0, 1);
-  std::unique_ptr<DMatrix> sparse{
-      new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
-                                  std::numeric_limits<float>::quiet_NaN(), threads, filename}};
+  std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix{
+      &iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
+      std::numeric_limits<float>::quiet_NaN(), threads, filename, false}};
  CHECK(sparse->Ctx()->Threads() == threads || sparse->Ctx()->Threads() == AllThreadsForTest());

  DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -81,10 +81,11 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {

  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};

-  GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
-  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
-  auto p_fmat = sample.p_fmat;
-  ASSERT_EQ(p_fmat, dmat.get());
+  ASSERT_THAT(
+      [&] {
+        GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
+      },
+      GMockThrow("extmem_concat_pages"));
 }

 TEST(GradientBasedSampler, UniformSampling) {
@@ -120,4 +121,4 @@ TEST(GradientBasedSampler, GradientBasedSamplingExternalMemory) {
  constexpr bool kFixedSizeSampling = false;
  VerifySampling(kPageSize, kSubsample, kSamplingMethod, kFixedSizeSampling);
 }
-};  // namespace xgboost::tree
+}  // namespace xgboost::tree
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -23,7 +23,7 @@ namespace xgboost::tree {
 namespace {
 void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
                RegTree* tree, HostDeviceVector<bst_float>* preds, float subsample,
-                const std::string& sampling_method, bst_bin_t max_bin) {
+                const std::string& sampling_method, bst_bin_t max_bin, bool concat_pages) {
  Args args{
      {"max_depth", "2"},
      {"max_bin", std::to_string(max_bin)},
@@ -38,13 +38,17 @@ void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix

  ObjInfo task{ObjInfo::kRegression};
  std::unique_ptr<TreeUpdater> hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
-  hist_maker->Configure(Args{});
+  if (subsample < 1.0) {
+    hist_maker->Configure(Args{{"extmem_concat_pages", std::to_string(concat_pages)}});
+  } else {
+    hist_maker->Configure(Args{});
+  }

  std::vector<HostDeviceVector<bst_node_t>> position(1);
  hist_maker->Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
                     {tree});
  auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1);
-  if (subsample < 1.0 && !dmat->SingleColBlock()) {
+  if (subsample < 1.0 && !dmat->SingleColBlock() && concat_pages) {
    ASSERT_FALSE(hist_maker->UpdatePredictionCache(dmat, cache));
  } else {
    ASSERT_TRUE(hist_maker->UpdatePredictionCache(dmat, cache));
@@ -69,12 +73,12 @@ TEST(GpuHist, UniformSampling) {
  // Build a tree using the in-memory DMatrix.
  RegTree tree;
  HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
-  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, false);
  // Build another tree using sampling.
  RegTree tree_sampling;
  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, ctx.Device());
  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, "uniform",
-             kRows);
+             kRows, false);

  // Make sure the predictions are the same.
  auto preds_h = preds.ConstHostVector();
@@ -100,13 +104,13 @@ TEST(GpuHist, GradientBasedSampling) {
  // Build a tree using the in-memory DMatrix.
  RegTree tree;
  HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
-  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, false);

  // Build another tree using sampling.
  RegTree tree_sampling;
  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, ctx.Device());
  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample,
-             "gradient_based", kRows);
+             "gradient_based", kRows, false);

  // Make sure the predictions are the same.
  auto preds_h = preds.ConstHostVector();
@@ -137,11 +141,11 @@ TEST(GpuHist, ExternalMemory) {
  // Build a tree using the in-memory DMatrix.
  RegTree tree;
  HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
-  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, true);
  // Build another tree using multiple ELLPACK pages.
  RegTree tree_ext;
  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, ctx.Device());
-  UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows, true);

  // Make sure the predictions are the same.
  auto preds_h = preds.ConstHostVector();
@@ -181,14 +185,14 @@ TEST(GpuHist, ExternalMemoryWithSampling) {

  RegTree tree;
  HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
-  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows);
+  UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows, true);

  // Build another tree using multiple ELLPACK pages.
  common::GlobalRandom() = rng;
  RegTree tree_ext;
  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, ctx.Device());
  UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, kSubsample, kSamplingMethod,
-             kRows);
+             kRows, true);

  Json jtree{Object{}};
  Json jtree_ext{Object{}};
@@ -228,6 +232,42 @@ TEST(GpuHist, MaxDepth) {
  ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
 }

+TEST(GpuHist, PageConcatConfig) {
+  auto ctx = MakeCUDACtx(0);
+  bst_idx_t n_samples = 64, n_features = 32;
+  auto p_fmat = RandomDataGenerator{n_samples, n_features, 0}.Batches(2).GenerateSparsePageDMatrix(
+      "temp", true);
+
+  auto learner = std::unique_ptr<Learner>(Learner::Create({p_fmat}));
+  learner->SetParam("device", ctx.DeviceName());
+  learner->SetParam("extmem_concat_pages", "true");
+  learner->SetParam("subsample", "0.8");
+  learner->Configure();
+
+  learner->UpdateOneIter(0, p_fmat);
+  learner->SetParam("extmem_concat_pages", "false");
+  learner->Configure();
+  // GPU Hist rebuilds the updater after configuration. Training continues
+  learner->UpdateOneIter(1, p_fmat);
+
+  learner->SetParam("extmem_concat_pages", "true");
+  learner->SetParam("subsample", "1.0");
+  ASSERT_THAT([&] { learner->UpdateOneIter(2, p_fmat); }, GMockThrow("extmem_concat_pages"));
+
+  // Throws error on CPU.
+  {
+    auto learner = std::unique_ptr<Learner>(Learner::Create({p_fmat}));
+    learner->SetParam("extmem_concat_pages", "true");
+    ASSERT_THAT([&] { learner->UpdateOneIter(0, p_fmat); }, GMockThrow("extmem_concat_pages"));
+  }
+  {
+    auto learner = std::unique_ptr<Learner>(Learner::Create({p_fmat}));
+    learner->SetParam("extmem_concat_pages", "true");
+    learner->SetParam("tree_method", "approx");
+    ASSERT_THAT([&] { learner->UpdateOneIter(0, p_fmat); }, GMockThrow("extmem_concat_pages"));
+  }
+}
+
 namespace {
 RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
  ObjInfo task{ObjInfo::kRegression};
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -3,6 +3,8 @@ import sys
 import pytest
 from hypothesis import given, settings, strategies

+import xgboost as xgb
+from xgboost import testing as tm
 from xgboost.testing import no_cupy
 from xgboost.testing.updater import check_extmem_qdm, check_quantile_loss_extmem

@@ -72,6 +74,22 @@ def test_extmem_qdm(
    check_extmem_qdm(n_samples_per_batch, n_features, n_batches, "cuda", on_host)


+def test_concat_pages() -> None:
+    it = tm.IteratorForTest(*tm.make_batches(64, 16, 4, use_cupy=True), cache=None)
+    Xy = xgb.ExtMemQuantileDMatrix(it)
+    with pytest.raises(ValueError, match="can not be used with concatenated pages"):
+        booster = xgb.train(
+            {
+                "device": "cuda",
+                "subsample": 0.5,
+                "sampling_method": "gradient_based",
+                "extmem_concat_pages": True,
+                "objective": "reg:absoluteerror",
+            },
+            Xy,
+        )
+
+
@given(
    strategies.integers(1, 64),
    strategies.integers(1, 8),
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -6,24 +6,32 @@ import pytest

 from xgboost import testing as tm

-sys.path.append("tests/python")
-import test_demos as td  # noqa
+DEMO_DIR = tm.demo_dir(__file__)
+PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, "guide-python")


@pytest.mark.skipif(**tm.no_cupy())
 def test_data_iterator():
-    script = os.path.join(td.PYTHON_DEMO_DIR, "quantile_data_iterator.py")
+    script = os.path.join(PYTHON_DEMO_DIR, "quantile_data_iterator.py")
    cmd = ["python", script]
    subprocess.check_call(cmd)


 def test_update_process_demo():
-    script = os.path.join(td.PYTHON_DEMO_DIR, "update_process.py")
+    script = os.path.join(PYTHON_DEMO_DIR, "update_process.py")
    cmd = ["python", script]
    subprocess.check_call(cmd)


 def test_categorical_demo():
-    script = os.path.join(td.PYTHON_DEMO_DIR, "categorical.py")
+    script = os.path.join(PYTHON_DEMO_DIR, "categorical.py")
+    cmd = ["python", script]
+    subprocess.check_call(cmd)
+
+
+@pytest.mark.skipif(**tm.no_rmm())
+@pytest.mark.skipif(**tm.no_cupy())
+def test_external_memory_demo():
+    script = os.path.join(PYTHON_DEMO_DIR, "external_memory.py")
    cmd = ["python", script]
    subprocess.check_call(cmd)