diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 74b2966fe..05337e788 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -57,12 +57,23 @@ def _check_data_shape(data: DataType) -> None: raise ValueError("Please reshape the input data into 2-dimensional matrix.") -def _is_scipy_csr(data: DataType) -> bool: +def is_scipy_csr(data: DataType) -> bool: + """Predicate for scipy CSR input.""" + is_array = False + is_matrix = False try: - import scipy.sparse + from scipy.sparse import csr_array + + is_array = isinstance(data, csr_array) except ImportError: - return False - return isinstance(data, scipy.sparse.csr_matrix) + pass + try: + from scipy.sparse import csr_matrix + + is_matrix = isinstance(data, csr_matrix) + except ImportError: + pass + return is_array or is_matrix def _array_interface_dict(data: np.ndarray) -> dict: @@ -135,12 +146,23 @@ def _from_scipy_csr( return handle, feature_names, feature_types -def _is_scipy_csc(data: DataType) -> bool: +def is_scipy_csc(data: DataType) -> bool: + """Predicate for scipy CSC input.""" + is_array = False + is_matrix = False try: - import scipy.sparse + from scipy.sparse import csc_array + + is_array = isinstance(data, csc_array) except ImportError: - return False - return isinstance(data, scipy.sparse.csc_matrix) + pass + try: + from scipy.sparse import csc_matrix + + is_matrix = isinstance(data, csc_matrix) + except ImportError: + pass + return is_array or is_matrix def _from_scipy_csc( @@ -171,12 +193,23 @@ def _from_scipy_csc( return handle, feature_names, feature_types -def _is_scipy_coo(data: DataType) -> bool: +def is_scipy_coo(data: DataType) -> bool: + """Predicate for scipy COO input.""" + is_array = False + is_matrix = False try: - import scipy.sparse + from scipy.sparse import coo_array + + is_array = isinstance(data, coo_array) except ImportError: - return False - return isinstance(data, scipy.sparse.coo_matrix) + pass + try: + from scipy.sparse import coo_matrix + + is_matrix = isinstance(data, coo_matrix) + except ImportError: + pass + return is_array or is_matrix def _is_np_array_like(data: DataType) -> bool: @@ -1138,15 +1171,15 @@ def dispatch_data_backend( """Dispatch data for DMatrix.""" if not _is_cudf_ser(data) and not _is_pandas_series(data): _check_data_shape(data) - if _is_scipy_csr(data): + if is_scipy_csr(data): return _from_scipy_csr( data, missing, threads, feature_names, feature_types, data_split_mode ) - if _is_scipy_csc(data): + if is_scipy_csc(data): return _from_scipy_csc( data, missing, threads, feature_names, feature_types, data_split_mode ) - if _is_scipy_coo(data): + if is_scipy_coo(data): return _from_scipy_csr( data.tocsr(), missing, @@ -1396,9 +1429,15 @@ def _proxy_transform( if _is_np_array_like(data): data, _ = _ensure_np_dtype(data, data.dtype) return data, None, feature_names, feature_types - if _is_scipy_csr(data): + if is_scipy_csr(data): data = transform_scipy_sparse(data, True) return data, None, feature_names, feature_types + if is_scipy_csc(data): + data = transform_scipy_sparse(data.tocsr(), True) + return data, None, feature_names, feature_types + if is_scipy_coo(data): + data = transform_scipy_sparse(data.tocsr(), True) + return data, None, feature_names, feature_types if _is_pandas_series(data): import pandas as pd @@ -1451,7 +1490,7 @@ def dispatch_proxy_set_data( _check_data_shape(data) proxy._set_data_from_array(data) # pylint: disable=W0212 return - if _is_scipy_csr(data): + if is_scipy_csr(data): proxy._set_data_from_csr(data) # pylint: disable=W0212 return raise err diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 05a9af3b0..2e8a1a2a6 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -112,39 +112,6 @@ class TestDMatrix: with pytest.raises(ValueError): xgb.DMatrix(data) - def test_csr(self): - indptr = np.array([0, 2, 3, 6]) - indices = np.array([0, 2, 2, 0, 1, 2]) - data = np.array([1, 2, 3, 4, 5, 6]) - X = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 3)) - dtrain = xgb.DMatrix(X) - assert dtrain.num_row() == 3 - assert dtrain.num_col() == 3 - - def test_csc(self): - row = np.array([0, 2, 2, 0, 1, 2]) - col = np.array([0, 0, 1, 2, 2, 2]) - data = np.array([1, 2, 3, 4, 5, 6]) - X = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3)) - dtrain = xgb.DMatrix(X) - assert dtrain.num_row() == 3 - assert dtrain.num_col() == 3 - - indptr = np.array([0, 3, 5]) - data = np.array([0, 1, 2, 3, 4]) - row_idx = np.array([0, 1, 2, 0, 2]) - X = scipy.sparse.csc_matrix((data, row_idx, indptr), shape=(3, 2)) - assert tm.predictor_equal(xgb.DMatrix(X.tocsr()), xgb.DMatrix(X)) - - def test_coo(self): - row = np.array([0, 2, 2, 0, 1, 2]) - col = np.array([0, 0, 1, 2, 2, 2]) - data = np.array([1, 2, 3, 4, 5, 6]) - X = scipy.sparse.coo_matrix((data, (row, col)), shape=(3, 3)) - dtrain = xgb.DMatrix(X) - assert dtrain.num_row() == 3 - assert dtrain.num_col() == 3 - def test_np_view(self): # Sliced Float32 array y = np.array([12, 34, 56], np.float32)[::2] diff --git a/tests/python/test_with_scipy.py b/tests/python/test_with_scipy.py new file mode 100644 index 000000000..ab54d2a43 --- /dev/null +++ b/tests/python/test_with_scipy.py @@ -0,0 +1,87 @@ +import itertools +import warnings +from typing import Type + +import numpy as np +import pytest +import scipy.sparse + +import xgboost as xgb +from xgboost import testing as tm + + +@pytest.mark.filterwarnings("error") +@pytest.mark.parametrize( + "DMatrixT,CSR", + [ + (m, n) + for m, n in itertools.product( + (xgb.DMatrix, xgb.QuantileDMatrix), + (scipy.sparse.csr_matrix, scipy.sparse.csr_array), + ) + ], +) +def test_csr(DMatrixT: Type[xgb.DMatrix], CSR: Type) -> None: + with warnings.catch_warnings(): + indptr = np.array([0, 2, 3, 6]) + indices = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + X = CSR((data, indices, indptr), shape=(3, 3)) + dtrain = DMatrixT(X) + assert dtrain.num_row() == 3 + assert dtrain.num_col() == 3 + assert dtrain.num_nonmissing() == data.size + + +@pytest.mark.filterwarnings("error") +@pytest.mark.parametrize( + "DMatrixT,CSC", + [ + (m, n) + for m, n in itertools.product( + (xgb.DMatrix, xgb.QuantileDMatrix), + (scipy.sparse.csc_matrix, scipy.sparse.csc_array), + ) + ], +) +def test_csc(DMatrixT: Type[xgb.DMatrix], CSC: Type) -> None: + with warnings.catch_warnings(): + row = np.array([0, 2, 2, 0, 1, 2]) + col = np.array([0, 0, 1, 2, 2, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + X = CSC((data, (row, col)), shape=(3, 3)) + dtrain = DMatrixT(X) + assert dtrain.num_row() == 3 + assert dtrain.num_col() == 3 + assert dtrain.num_nonmissing() == data.size + + indptr = np.array([0, 3, 5]) + data = np.array([0, 1, 2, 3, 4]) + row_idx = np.array([0, 1, 2, 0, 2]) + X = CSC((data, row_idx, indptr), shape=(3, 2)) + assert tm.predictor_equal(DMatrixT(X.tocsr()), DMatrixT(X)) + + +@pytest.mark.filterwarnings("error") +@pytest.mark.parametrize( + "DMatrixT,COO", + [ + (m, n) + for m, n in itertools.product( + (xgb.DMatrix, xgb.QuantileDMatrix), + (scipy.sparse.coo_matrix, scipy.sparse.coo_array), + ) + ], +) +def test_coo(DMatrixT: Type[xgb.DMatrix], COO: Type) -> None: + with warnings.catch_warnings(): + row = np.array([0, 2, 2, 0, 1, 2]) + col = np.array([0, 0, 1, 2, 2, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + X = COO((data, (row, col)), shape=(3, 3)) + dtrain = DMatrixT(X) + assert dtrain.num_row() == 3 + assert dtrain.num_col() == 3 + assert dtrain.num_nonmissing() == data.size + + assert tm.predictor_equal(DMatrixT(X.tocsr()), DMatrixT(X))