Support CPU input for device QuantileDMatrix. (#8136)

- Copy `GHistIndexMatrix` to `Ellpack` when needed.
2022-08-11 21:21:26 +08:00
parent 36e7c5364d
commit 16bca5d4a1
11 changed files with 220 additions and 19 deletions
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -121,7 +121,6 @@ if __name__ == "__main__":
                "python-package/xgboost/sklearn.py",
                "python-package/xgboost/spark",
                "python-package/xgboost/federated.py",
-                "python-package/xgboost/spark",
                # tests
                "tests/python/test_config.py",
                "tests/python/test_spark/",
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -236,4 +236,45 @@ TEST(EllpackPage, Compact) {
    }
  }
 }
+
+namespace {
+class EllpackPageTest : public testing::TestWithParam<float> {
+ protected:
+  void Run(float sparsity) {
+    // Compares an Ellpack page built from GHistIndexMatrix against GetBatches<EllpackPage>.
+    // Only a small sample is used as the cuts might differ between host and device.
+    size_t n_samples{128}, n_features{13};
+    Context ctx;
+    ctx.gpu_id = 0;
+    auto Xy = RandomDataGenerator{n_samples, n_features, sparsity}.GenerateDMatrix(true);
+    std::unique_ptr<EllpackPageImpl> from_ghist;
+    ASSERT_TRUE(Xy->SingleColBlock());
+    for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(BatchParam{17, 0.6})) {
+      from_ghist.reset(new EllpackPageImpl{&ctx, page, {}});
+    }
+    // Metadata and every entry of the compressed gidx buffer must match between the pages.
+    for (auto const& page : Xy->GetBatches<EllpackPage>(BatchParam{0, 17})) {
+      auto from_sparse_page = page.Impl();
+      ASSERT_EQ(from_sparse_page->is_dense, from_ghist->is_dense);
+      ASSERT_EQ(from_sparse_page->base_rowid, 0);
+      ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid);
+      ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows);
+      ASSERT_EQ(from_sparse_page->gidx_buffer.Size(), from_ghist->gidx_buffer.Size());
+      auto const& h_gidx_from_sparse = from_sparse_page->gidx_buffer.HostVector();
+      auto const& h_gidx_from_ghist = from_ghist->gidx_buffer.HostVector();
+      ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
+      common::CompressedIterator<uint32_t> from_ghist_it(h_gidx_from_ghist.data(),
+                                                         from_ghist->NumSymbols());
+      common::CompressedIterator<uint32_t> from_sparse_it(h_gidx_from_sparse.data(),
+                                                          from_sparse_page->NumSymbols());
+      for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) {
+        EXPECT_EQ(from_ghist_it[i], from_sparse_it[i]);
+      }
+    }
+  }
+};
+}  // namespace
+// Parameterized over the sparsity value handed to RandomDataGenerator.
+TEST_P(EllpackPageTest, FromGHistIndex) { this->Run(GetParam()); }
+INSTANTIATE_TEST_SUITE_P(EllpackPage, EllpackPageTest, testing::Values(.0f, .2f, .4f, .8f));
 }  // namespace xgboost
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -31,6 +31,34 @@ class TestDeviceQuantileDMatrix:
        data = cp.random.randn(5, 5)
        xgb.DeviceQuantileDMatrix(data, cp.ones(5, dtype=np.float64))

+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_from_host(self) -> None:
+        import cupy as cp
+        n_samples = 64
+        n_features = 3
+        X, y, w = tm.make_batches(
+            n_samples, n_features=n_features, n_batches=1, use_cupy=False
+        )
+        Xy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0])
+        booster_0 = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=4)
+        # Retrain from cupy copies of the same batch; predictions should be close.
+        X[0] = cp.array(X[0])
+        y[0] = cp.array(y[0])
+        w[0] = cp.array(w[0])
+
+        Xy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0])
+        booster_1 = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=4)
+        cp.testing.assert_allclose(
+            booster_0.inplace_predict(X[0]), booster_1.inplace_predict(X[0])
+        )
+
+        with pytest.raises(ValueError, match="not initialized with CPU"):
+            # Training on CPU with GPU data is not supported.
+            xgb.train({"tree_method": "hist"}, Xy, num_boost_round=4)
+
+        with pytest.raises(ValueError, match=r"Only.*hist.*"):
+            xgb.train({"tree_method": "approx"}, Xy, num_boost_round=4)
+
    @pytest.mark.skipif(**tm.no_cupy())
    def test_metainfo(self) -> None:
        import cupy as cp