Support more scipy types. (#9881)

This commit is contained in:
Jiaming Yuan 2023-12-14 18:28:37 +08:00 committed by GitHub
parent cd473c9da3
commit 1aa8c8d9be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 143 additions and 50 deletions

View File

@@ -57,12 +57,23 @@ def _check_data_shape(data: DataType) -> None:
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
# NOTE(review): this span looks like a rendered diff hunk in which removed and
# added lines are interleaved without +/- markers (two `def` lines, a `return`
# directly followed by `pass`); confirm against the real file before editing.
def _is_scipy_csr(data: DataType) -> bool:
def is_scipy_csr(data: DataType) -> bool:
"""Predicate for scipy CSR input."""
# Both flags start False so a failed import simply leaves its check negative.
is_array = False
is_matrix = False
try:
import scipy.sparse
# csr_array is imported in its own try block; an ImportError here only
# skips the array check (presumably for SciPy builds without csr_array).
from scipy.sparse import csr_array
is_array = isinstance(data, csr_array)
except ImportError:
return False
return isinstance(data, scipy.sparse.csr_matrix)
pass
try:
# The csr_matrix check is guarded separately from the csr_array check.
from scipy.sparse import csr_matrix
is_matrix = isinstance(data, csr_matrix)
except ImportError:
pass
# True when data is an instance of either CSR container.
return is_array or is_matrix
def _array_interface_dict(data: np.ndarray) -> dict:
@@ -135,12 +146,23 @@ def _from_scipy_csr(
return handle, feature_names, feature_types
# NOTE(review): this span looks like a rendered diff hunk in which removed and
# added lines are interleaved without +/- markers (two `def` lines, a `return`
# directly followed by `pass`); confirm against the real file before editing.
def _is_scipy_csc(data: DataType) -> bool:
def is_scipy_csc(data: DataType) -> bool:
"""Predicate for scipy CSC input."""
# Both flags start False so a failed import simply leaves its check negative.
is_array = False
is_matrix = False
try:
import scipy.sparse
# csc_array is imported in its own try block; an ImportError here only
# skips the array check (presumably for SciPy builds without csc_array).
from scipy.sparse import csc_array
is_array = isinstance(data, csc_array)
except ImportError:
return False
return isinstance(data, scipy.sparse.csc_matrix)
pass
try:
# The csc_matrix check is guarded separately from the csc_array check.
from scipy.sparse import csc_matrix
is_matrix = isinstance(data, csc_matrix)
except ImportError:
pass
# True when data is an instance of either CSC container.
return is_array or is_matrix
def _from_scipy_csc(
@@ -171,12 +193,23 @@ def _from_scipy_csc(
return handle, feature_names, feature_types
# NOTE(review): this span looks like a rendered diff hunk in which removed and
# added lines are interleaved without +/- markers (two `def` lines, a `return`
# directly followed by `pass`); confirm against the real file before editing.
def _is_scipy_coo(data: DataType) -> bool:
def is_scipy_coo(data: DataType) -> bool:
"""Predicate for scipy COO input."""
# Both flags start False so a failed import simply leaves its check negative.
is_array = False
is_matrix = False
try:
import scipy.sparse
# coo_array is imported in its own try block; an ImportError here only
# skips the array check (presumably for SciPy builds without coo_array).
from scipy.sparse import coo_array
is_array = isinstance(data, coo_array)
except ImportError:
return False
return isinstance(data, scipy.sparse.coo_matrix)
pass
try:
# The coo_matrix check is guarded separately from the coo_array check.
from scipy.sparse import coo_matrix
is_matrix = isinstance(data, coo_matrix)
except ImportError:
pass
# True when data is an instance of either COO container.
return is_array or is_matrix
def _is_np_array_like(data: DataType) -> bool:
@@ -1138,15 +1171,15 @@ def dispatch_data_backend(
"""Dispatch data for DMatrix."""
if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data)
if _is_scipy_csr(data):
if is_scipy_csr(data):
return _from_scipy_csr(
data, missing, threads, feature_names, feature_types, data_split_mode
)
if _is_scipy_csc(data):
if is_scipy_csc(data):
return _from_scipy_csc(
data, missing, threads, feature_names, feature_types, data_split_mode
)
if _is_scipy_coo(data):
if is_scipy_coo(data):
return _from_scipy_csr(
data.tocsr(),
missing,
@@ -1396,9 +1429,15 @@ def _proxy_transform(
if _is_np_array_like(data):
data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
if is_scipy_csr(data):
data = transform_scipy_sparse(data, True)
return data, None, feature_names, feature_types
if is_scipy_csc(data):
data = transform_scipy_sparse(data.tocsr(), True)
return data, None, feature_names, feature_types
if is_scipy_coo(data):
data = transform_scipy_sparse(data.tocsr(), True)
return data, None, feature_names, feature_types
if _is_pandas_series(data):
import pandas as pd
@@ -1451,7 +1490,7 @@ def dispatch_proxy_set_data(
_check_data_shape(data)
proxy._set_data_from_array(data) # pylint: disable=W0212
return
if _is_scipy_csr(data):
if is_scipy_csr(data):
proxy._set_data_from_csr(data) # pylint: disable=W0212
return
raise err

View File

@@ -112,39 +112,6 @@ class TestDMatrix:
with pytest.raises(ValueError):
xgb.DMatrix(data)
def test_csr(self):
    """Build a DMatrix from a 3x3 scipy CSR matrix and check its shape."""
    # CSR triplet: values, column indices, and row pointer offsets.
    values = np.array([1, 2, 3, 4, 5, 6])
    col_idx = np.array([0, 2, 2, 0, 1, 2])
    row_ptr = np.array([0, 2, 3, 6])
    sparse = scipy.sparse.csr_matrix((values, col_idx, row_ptr), shape=(3, 3))
    dmat = xgb.DMatrix(sparse)
    assert dmat.num_row() == 3
    assert dmat.num_col() == 3
def test_csc(self):
    """Build DMatrix objects from scipy CSC matrices and check them."""
    # First case: CSC built from coordinate triplets; only the shape is checked.
    values = np.array([1, 2, 3, 4, 5, 6])
    rows = np.array([0, 2, 2, 0, 1, 2])
    cols = np.array([0, 0, 1, 2, 2, 2])
    sparse = scipy.sparse.csc_matrix((values, (rows, cols)), shape=(3, 3))
    dmat = xgb.DMatrix(sparse)
    assert dmat.num_row() == 3
    assert dmat.num_col() == 3

    # Second case: CSC built from (data, row indices, column pointers); the
    # DMatrix from its CSR conversion is compared with the one from the CSC.
    col_ptr = np.array([0, 3, 5])
    values = np.array([0, 1, 2, 3, 4])
    row_indices = np.array([0, 1, 2, 0, 2])
    sparse = scipy.sparse.csc_matrix(
        (values, row_indices, col_ptr), shape=(3, 2)
    )
    from_csr = xgb.DMatrix(sparse.tocsr())
    from_csc = xgb.DMatrix(sparse)
    assert tm.predictor_equal(from_csr, from_csc)
def test_coo(self):
    """Build a DMatrix from a 3x3 scipy COO matrix and check its shape."""
    # Coordinate format: one (row, col) pair per stored value.
    values = np.array([1, 2, 3, 4, 5, 6])
    rows = np.array([0, 2, 2, 0, 1, 2])
    cols = np.array([0, 0, 1, 2, 2, 2])
    sparse = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(3, 3))
    dmat = xgb.DMatrix(sparse)
    assert dmat.num_row() == 3
    assert dmat.num_col() == 3
def test_np_view(self):
# Sliced Float32 array
y = np.array([12, 34, 56], np.float32)[::2]

View File

@@ -0,0 +1,87 @@
import itertools
import warnings
from typing import Type
import numpy as np
import pytest
import scipy.sparse
import xgboost as xgb
from xgboost import testing as tm
@pytest.mark.filterwarnings("error")
@pytest.mark.parametrize(
    "DMatrixT,CSR",
    list(
        itertools.product(
            (xgb.DMatrix, xgb.QuantileDMatrix),
            (scipy.sparse.csr_matrix, scipy.sparse.csr_array),
        )
    ),
)
def test_csr(DMatrixT: Type[xgb.DMatrix], CSR: Type) -> None:
    """Construct each DMatrix type from each scipy CSR container.

    Runs for every (DMatrix class, CSR class) pair; the ``filterwarnings``
    mark turns any warning raised along the way into a test failure.
    """
    with warnings.catch_warnings():
        # CSR triplet: values, column indices, and row pointer offsets.
        values = np.array([1, 2, 3, 4, 5, 6])
        col_idx = np.array([0, 2, 2, 0, 1, 2])
        row_ptr = np.array([0, 2, 3, 6])
        sparse = CSR((values, col_idx, row_ptr), shape=(3, 3))
        dmat = DMatrixT(sparse)
        assert dmat.num_row() == 3
        assert dmat.num_col() == 3
        # Every stored value is expected to be counted as non-missing.
        assert dmat.num_nonmissing() == values.size
@pytest.mark.filterwarnings("error")
@pytest.mark.parametrize(
    "DMatrixT,CSC",
    list(
        itertools.product(
            (xgb.DMatrix, xgb.QuantileDMatrix),
            (scipy.sparse.csc_matrix, scipy.sparse.csc_array),
        )
    ),
)
def test_csc(DMatrixT: Type[xgb.DMatrix], CSC: Type) -> None:
    """Construct each DMatrix type from each scipy CSC container.

    Runs for every (DMatrix class, CSC class) pair; the ``filterwarnings``
    mark turns any warning raised along the way into a test failure.
    """
    with warnings.catch_warnings():
        # First case: CSC built from coordinate triplets.
        values = np.array([1, 2, 3, 4, 5, 6])
        rows = np.array([0, 2, 2, 0, 1, 2])
        cols = np.array([0, 0, 1, 2, 2, 2])
        sparse = CSC((values, (rows, cols)), shape=(3, 3))
        dmat = DMatrixT(sparse)
        assert dmat.num_row() == 3
        assert dmat.num_col() == 3
        assert dmat.num_nonmissing() == values.size

        # Second case: CSC built from (data, row indices, column pointers);
        # the DMatrix from its CSR conversion is compared with the CSC one.
        col_ptr = np.array([0, 3, 5])
        values = np.array([0, 1, 2, 3, 4])
        row_indices = np.array([0, 1, 2, 0, 2])
        sparse = CSC((values, row_indices, col_ptr), shape=(3, 2))
        from_csr = DMatrixT(sparse.tocsr())
        from_csc = DMatrixT(sparse)
        assert tm.predictor_equal(from_csr, from_csc)
@pytest.mark.filterwarnings("error")
@pytest.mark.parametrize(
    "DMatrixT,COO",
    list(
        itertools.product(
            (xgb.DMatrix, xgb.QuantileDMatrix),
            (scipy.sparse.coo_matrix, scipy.sparse.coo_array),
        )
    ),
)
def test_coo(DMatrixT: Type[xgb.DMatrix], COO: Type) -> None:
    """Construct each DMatrix type from each scipy COO container.

    Runs for every (DMatrix class, COO class) pair; the ``filterwarnings``
    mark turns any warning raised along the way into a test failure.
    """
    with warnings.catch_warnings():
        # Coordinate format: one (row, col) pair per stored value.
        values = np.array([1, 2, 3, 4, 5, 6])
        rows = np.array([0, 2, 2, 0, 1, 2])
        cols = np.array([0, 0, 1, 2, 2, 2])
        sparse = COO((values, (rows, cols)), shape=(3, 3))
        dmat = DMatrixT(sparse)
        assert dmat.num_row() == 3
        assert dmat.num_col() == 3
        assert dmat.num_nonmissing() == values.size

        # The DMatrix from the CSR conversion is compared with the COO one.
        from_csr = DMatrixT(sparse.tocsr())
        from_coo = DMatrixT(sparse)
        assert tm.predictor_equal(from_csr, from_coo)