Obtain CSR matrix from DMatrix. (#8269)

2022-09-29 20:41:43 +08:00
parent b14c44ee5e
commit 55cf24cc32
22 changed files with 400 additions and 74 deletions
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -1,13 +1,14 @@
-# -*- coding: utf-8 -*-
 import os
 import tempfile
-import numpy as np
-import xgboost as xgb
-import scipy.sparse
-import pytest
-from scipy.sparse import rand, csr_matrix

+import numpy as np
+import pytest
+import scipy.sparse
 import testing as tm
+from hypothesis import given, settings, strategies
+from scipy.sparse import csr_matrix, rand
+
+import xgboost as xgb

 rng = np.random.RandomState(1)

@@ -433,3 +434,22 @@ class TestDMatrix:

    def test_base_margin(self):
        set_base_margin_info(np.asarray, xgb.DMatrix, "hist")
+
+    @given(
+        strategies.integers(0, 1000),  # n_samples; 0 exercises the empty-matrix branch
+        strategies.integers(0, 100),  # n_features; 0 exercises the empty-matrix branch
+        strategies.fractions(0, 1),  # sparsity, drawn as a Fraction in [0, 1]
+    )
+    @settings(deadline=None, print_blob=True)  # no per-example deadline
+    def test_to_csr(self, n_samples, n_features, sparsity) -> None:  # CSR -> DMatrix -> get_data() round trip
+        if n_samples == 0 or n_features == 0 or sparsity == 1.0:  # degenerate draws
+            csr = scipy.sparse.csr_matrix(np.empty((0, 0)))  # replaced by an empty 0x0 CSR matrix
+        else:
+            csr = tm.make_sparse_regression(n_samples, n_features, sparsity, False)[  # False: presumably as_dense -- confirm
+                0  # first returned element is the feature matrix; the second (labels) is dropped
+            ].astype(np.float32)  # presumably matches the dtype get_data() returns -- confirm
+        m = xgb.DMatrix(data=csr)
+        ret = m.get_data()  # matrix read back out of the DMatrix
+        np.testing.assert_equal(csr.indptr, ret.indptr)  # all three CSR arrays must match exactly
+        np.testing.assert_equal(csr.data, ret.data)
+        np.testing.assert_equal(csr.indices, ret.indices)
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -1,9 +1,16 @@
-from typing import Dict, List, Any
+from typing import Any, Dict, List

 import numpy as np
 import pytest
+from hypothesis import given, settings, strategies
 from scipy import sparse
-from testing import IteratorForTest, make_batches, make_batches_sparse, make_categorical
+from testing import (
+    IteratorForTest,
+    make_batches,
+    make_batches_sparse,
+    make_categorical,
+    make_sparse_regression,
+)

 import xgboost as xgb

@@ -102,6 +109,7 @@ class TestQuantileDMatrix:
            )
            if tree_method == "gpu_hist":
                import cudf
+
                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
@@ -154,6 +162,7 @@ class TestQuantileDMatrix:
            X, y = make_categorical(n_samples, n_features, 13, onehot=False)
            if tree_method == "gpu_hist":
                import cudf
+
                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
@@ -198,9 +207,7 @@ class TestQuantileDMatrix:

    def test_predict(self) -> None:
        n_samples, n_features = 16, 2
-        X, y = make_categorical(
-            n_samples, n_features, n_categories=13, onehot=False
-        )
+        X, y = make_categorical(n_samples, n_features, n_categories=13, onehot=False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)

        booster = xgb.train({"tree_method": "hist"}, Xy)
@@ -210,3 +217,24 @@ class TestQuantileDMatrix:
        qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
        b = booster.predict(qXy)
        np.testing.assert_allclose(a, b)
+
+    # We don't test an empty QuantileDMatrix in single-node construction.
+    @given(
+        strategies.integers(1, 1000),  # n_samples, at least 1
+        strategies.integers(1, 100),  # n_features, at least 1
+        strategies.fractions(0, 0.99),  # sparsity: a Fraction capped below 1 (annotated as float below)
+    )
+    @settings(deadline=None, print_blob=True)  # no per-example deadline
+    def test_to_csr(self, n_samples: int, n_features: int, sparsity: float) -> None:
+        csr, y = make_sparse_regression(n_samples, n_features, sparsity, False)  # False: presumably as_dense -- confirm
+        csr = csr.astype(np.float32)
+        qdm = xgb.QuantileDMatrix(data=csr, label=y)
+        ret = qdm.get_data()
+        np.testing.assert_equal(csr.indptr, ret.indptr)  # only the sparsity structure is compared;
+        np.testing.assert_equal(csr.indices, ret.indices)  # `data` is not (presumably quantized -- confirm)

+        booster = xgb.train({"tree_method": "hist"}, dtrain=qdm)

+        np.testing.assert_allclose(  # predictions on qdm must match a DMatrix rebuilt from qdm.get_data()
+            booster.predict(qdm), booster.predict(xgb.DMatrix(qdm.get_data()))
+        )
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -577,6 +577,8 @@ def make_sparse_regression(

    if as_dense:
        arr = csr.toarray()
+        assert arr.shape[0] == n_samples
+        assert arr.shape[1] == n_features
        arr[arr == 0] = np.nan
        return arr, y