Obtain CSR matrix from DMatrix. (#8269)

2022-09-29 20:41:43 +08:00
parent b14c44ee5e
commit 55cf24cc32
22 changed files with 400 additions and 74 deletions
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -125,6 +125,7 @@ if __name__ == "__main__":
                # tests
                "tests/python/test_config.py",
                "tests/python/test_spark/",
+                "tests/python/test_quantile_dmatrix.py",
                "tests/python-gpu/test_gpu_spark/",
                "tests/ci_build/lint_python.py",
                # demo
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -2,6 +2,8 @@ import numpy as np
 import xgboost as xgb
 import pytest
 import sys
+from hypothesis import given, strategies, settings
+from scipy import sparse

 sys.path.append("tests/python")
 import testing as tm
@@ -96,3 +98,42 @@ class TestDeviceQuantileDMatrix:
        import cupy as cp
        rng = cp.random.RandomState(1994)
        self.cputest.run_ref_dmatrix(rng, "gpu_hist", False)
+
+    @given(
+        strategies.integers(1, 1000),
+        strategies.integers(1, 100),
+        strategies.fractions(0, 0.99),
+    )
+    @settings(print_blob=True, deadline=None)
+    def test_to_csr(self, n_samples, n_features, sparsity) -> None:
+        """Check the CSR layout returned by a QuantileDMatrix built from CuPy."""
+        import cupy as cp
+
+        X, y = tm.make_sparse_regression(n_samples, n_features, sparsity, False)
+        expected = X.astype(np.float32)
+
+        # Dense copy of the same data, with every zero re-encoded as NaN.
+        dense = X.toarray()
+        dense[dense == 0] = np.nan
+
+        host_qdm = xgb.QuantileDMatrix(data=dense)
+        from_host = host_qdm.get_data()
+
+        qdm = xgb.QuantileDMatrix(data=cp.array(dense), label=y)
+        from_device = qdm.get_data()
+
+        # Only the sparsity structure (indptr/indices) is compared, not values.
+        np.testing.assert_equal(expected.indptr, from_device.indptr)
+        np.testing.assert_equal(expected.indices, from_device.indices)
+
+        np.testing.assert_equal(from_host.indptr, from_device.indptr)
+        np.testing.assert_equal(from_host.indices, from_device.indices)
+
+        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain=qdm)
+
+        # Predictions must agree after a round trip through `get_data`.
+        np.testing.assert_allclose(
+            booster.predict(qdm),
+            booster.predict(xgb.DMatrix(qdm.get_data())),
+            atol=1e-6,
+        )
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -1,13 +1,14 @@
-# -*- coding: utf-8 -*-
 import os
 import tempfile
-import numpy as np
-import xgboost as xgb
-import scipy.sparse
-import pytest
-from scipy.sparse import rand, csr_matrix

+import numpy as np
+import pytest
+import scipy.sparse
 import testing as tm
+from hypothesis import given, settings, strategies
+from scipy.sparse import csr_matrix, rand
+
+import xgboost as xgb

 rng = np.random.RandomState(1)

@@ -433,3 +434,22 @@ class TestDMatrix:

    def test_base_margin(self):
        set_base_margin_info(np.asarray, xgb.DMatrix, "hist")
+
+    @given(
+        strategies.integers(0, 1000),
+        strategies.integers(0, 100),
+        strategies.fractions(0, 1),
+    )
+    @settings(deadline=None, print_blob=True)
+    def test_to_csr(self, n_samples, n_features, sparsity) -> None:
+        """Check that `get_data` returns the CSR matrix the DMatrix was built from."""
+        if n_samples != 0 and n_features != 0 and sparsity != 1.0:
+            X, _ = tm.make_sparse_regression(n_samples, n_features, sparsity, False)
+            expected = X.astype(np.float32)
+        else:
+            expected = scipy.sparse.csr_matrix(np.empty((0, 0)))
+        dmat = xgb.DMatrix(data=expected)
+        actual = dmat.get_data()
+        np.testing.assert_equal(expected.indptr, actual.indptr)
+        np.testing.assert_equal(expected.data, actual.data)
+        np.testing.assert_equal(expected.indices, actual.indices)
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -1,9 +1,16 @@
-from typing import Dict, List, Any
+from typing import Any, Dict, List

 import numpy as np
 import pytest
+from hypothesis import given, settings, strategies
 from scipy import sparse
-from testing import IteratorForTest, make_batches, make_batches_sparse, make_categorical
+from testing import (
+    IteratorForTest,
+    make_batches,
+    make_batches_sparse,
+    make_categorical,
+    make_sparse_regression,
+)

 import xgboost as xgb

@@ -102,6 +109,7 @@ class TestQuantileDMatrix:
            )
            if tree_method == "gpu_hist":
                import cudf
+
                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
@@ -154,6 +162,7 @@ class TestQuantileDMatrix:
            X, y = make_categorical(n_samples, n_features, 13, onehot=False)
            if tree_method == "gpu_hist":
                import cudf
+
                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
@@ -198,9 +207,7 @@ class TestQuantileDMatrix:

    def test_predict(self) -> None:
        n_samples, n_features = 16, 2
-        X, y = make_categorical(
-            n_samples, n_features, n_categories=13, onehot=False
-        )
+        X, y = make_categorical(n_samples, n_features, n_categories=13, onehot=False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)

        booster = xgb.train({"tree_method": "hist"}, Xy)
@@ -210,3 +217,24 @@ class TestQuantileDMatrix:
        qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
        b = booster.predict(qXy)
        np.testing.assert_allclose(a, b)
+
+    # We don't test an empty QuantileDMatrix in single-node construction.
+    @given(
+        strategies.integers(1, 1000),
+        strategies.integers(1, 100),
+        strategies.fractions(0, 0.99),
+    )
+    @settings(deadline=None, print_blob=True)
+    def test_to_csr(self, n_samples: int, n_features: int, sparsity: float) -> None:
+        """Check the CSR layout and prediction round trip of `get_data`."""
+        X, y = make_sparse_regression(n_samples, n_features, sparsity, False)
+        expected = X.astype(np.float32)
+        qdm = xgb.QuantileDMatrix(data=expected, label=y)
+        actual = qdm.get_data()
+        np.testing.assert_equal(expected.indptr, actual.indptr)
+        np.testing.assert_equal(expected.indices, actual.indices)
+
+        booster = xgb.train({"tree_method": "hist"}, dtrain=qdm)
+        direct = booster.predict(qdm)
+        round_trip = booster.predict(xgb.DMatrix(qdm.get_data()))
+        np.testing.assert_allclose(direct, round_trip)
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -577,6 +577,8 @@ def make_sparse_regression(

    if as_dense:
        arr = csr.toarray()
+        assert arr.shape[0] == n_samples
+        assert arr.shape[1] == n_features
        arr[arr == 0] = np.nan
        return arr, y