Support more scipy types. (#9881)
This commit is contained in:
parent
cd473c9da3
commit
1aa8c8d9be
@ -57,12 +57,23 @@ def _check_data_shape(data: DataType) -> None:
|
|||||||
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
|
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
|
||||||
|
|
||||||
|
|
||||||
def _is_scipy_csr(data: DataType) -> bool:
|
def is_scipy_csr(data: DataType) -> bool:
|
||||||
|
"""Predicate for scipy CSR input."""
|
||||||
|
is_array = False
|
||||||
|
is_matrix = False
|
||||||
try:
|
try:
|
||||||
import scipy.sparse
|
from scipy.sparse import csr_array
|
||||||
|
|
||||||
|
is_array = isinstance(data, csr_array)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return False
|
pass
|
||||||
return isinstance(data, scipy.sparse.csr_matrix)
|
try:
|
||||||
|
from scipy.sparse import csr_matrix
|
||||||
|
|
||||||
|
is_matrix = isinstance(data, csr_matrix)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
return is_array or is_matrix
|
||||||
|
|
||||||
|
|
||||||
def _array_interface_dict(data: np.ndarray) -> dict:
|
def _array_interface_dict(data: np.ndarray) -> dict:
|
||||||
@ -135,12 +146,23 @@ def _from_scipy_csr(
|
|||||||
return handle, feature_names, feature_types
|
return handle, feature_names, feature_types
|
||||||
|
|
||||||
|
|
||||||
def _is_scipy_csc(data: DataType) -> bool:
|
def is_scipy_csc(data: DataType) -> bool:
|
||||||
|
"""Predicate for scipy CSC input."""
|
||||||
|
is_array = False
|
||||||
|
is_matrix = False
|
||||||
try:
|
try:
|
||||||
import scipy.sparse
|
from scipy.sparse import csc_array
|
||||||
|
|
||||||
|
is_array = isinstance(data, csc_array)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return False
|
pass
|
||||||
return isinstance(data, scipy.sparse.csc_matrix)
|
try:
|
||||||
|
from scipy.sparse import csc_matrix
|
||||||
|
|
||||||
|
is_matrix = isinstance(data, csc_matrix)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
return is_array or is_matrix
|
||||||
|
|
||||||
|
|
||||||
def _from_scipy_csc(
|
def _from_scipy_csc(
|
||||||
@ -171,12 +193,23 @@ def _from_scipy_csc(
|
|||||||
return handle, feature_names, feature_types
|
return handle, feature_names, feature_types
|
||||||
|
|
||||||
|
|
||||||
def _is_scipy_coo(data: DataType) -> bool:
|
def is_scipy_coo(data: DataType) -> bool:
|
||||||
|
"""Predicate for scipy COO input."""
|
||||||
|
is_array = False
|
||||||
|
is_matrix = False
|
||||||
try:
|
try:
|
||||||
import scipy.sparse
|
from scipy.sparse import coo_array
|
||||||
|
|
||||||
|
is_array = isinstance(data, coo_array)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return False
|
pass
|
||||||
return isinstance(data, scipy.sparse.coo_matrix)
|
try:
|
||||||
|
from scipy.sparse import coo_matrix
|
||||||
|
|
||||||
|
is_matrix = isinstance(data, coo_matrix)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
return is_array or is_matrix
|
||||||
|
|
||||||
|
|
||||||
def _is_np_array_like(data: DataType) -> bool:
|
def _is_np_array_like(data: DataType) -> bool:
|
||||||
@ -1138,15 +1171,15 @@ def dispatch_data_backend(
|
|||||||
"""Dispatch data for DMatrix."""
|
"""Dispatch data for DMatrix."""
|
||||||
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
||||||
_check_data_shape(data)
|
_check_data_shape(data)
|
||||||
if _is_scipy_csr(data):
|
if is_scipy_csr(data):
|
||||||
return _from_scipy_csr(
|
return _from_scipy_csr(
|
||||||
data, missing, threads, feature_names, feature_types, data_split_mode
|
data, missing, threads, feature_names, feature_types, data_split_mode
|
||||||
)
|
)
|
||||||
if _is_scipy_csc(data):
|
if is_scipy_csc(data):
|
||||||
return _from_scipy_csc(
|
return _from_scipy_csc(
|
||||||
data, missing, threads, feature_names, feature_types, data_split_mode
|
data, missing, threads, feature_names, feature_types, data_split_mode
|
||||||
)
|
)
|
||||||
if _is_scipy_coo(data):
|
if is_scipy_coo(data):
|
||||||
return _from_scipy_csr(
|
return _from_scipy_csr(
|
||||||
data.tocsr(),
|
data.tocsr(),
|
||||||
missing,
|
missing,
|
||||||
@ -1396,9 +1429,15 @@ def _proxy_transform(
|
|||||||
if _is_np_array_like(data):
|
if _is_np_array_like(data):
|
||||||
data, _ = _ensure_np_dtype(data, data.dtype)
|
data, _ = _ensure_np_dtype(data, data.dtype)
|
||||||
return data, None, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
if _is_scipy_csr(data):
|
if is_scipy_csr(data):
|
||||||
data = transform_scipy_sparse(data, True)
|
data = transform_scipy_sparse(data, True)
|
||||||
return data, None, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
|
if is_scipy_csc(data):
|
||||||
|
data = transform_scipy_sparse(data.tocsr(), True)
|
||||||
|
return data, None, feature_names, feature_types
|
||||||
|
if is_scipy_coo(data):
|
||||||
|
data = transform_scipy_sparse(data.tocsr(), True)
|
||||||
|
return data, None, feature_names, feature_types
|
||||||
if _is_pandas_series(data):
|
if _is_pandas_series(data):
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@ -1451,7 +1490,7 @@ def dispatch_proxy_set_data(
|
|||||||
_check_data_shape(data)
|
_check_data_shape(data)
|
||||||
proxy._set_data_from_array(data) # pylint: disable=W0212
|
proxy._set_data_from_array(data) # pylint: disable=W0212
|
||||||
return
|
return
|
||||||
if _is_scipy_csr(data):
|
if is_scipy_csr(data):
|
||||||
proxy._set_data_from_csr(data) # pylint: disable=W0212
|
proxy._set_data_from_csr(data) # pylint: disable=W0212
|
||||||
return
|
return
|
||||||
raise err
|
raise err
|
||||||
|
|||||||
@ -112,39 +112,6 @@ class TestDMatrix:
|
|||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
xgb.DMatrix(data)
|
xgb.DMatrix(data)
|
||||||
|
|
||||||
def test_csr(self):
|
|
||||||
indptr = np.array([0, 2, 3, 6])
|
|
||||||
indices = np.array([0, 2, 2, 0, 1, 2])
|
|
||||||
data = np.array([1, 2, 3, 4, 5, 6])
|
|
||||||
X = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 3))
|
|
||||||
dtrain = xgb.DMatrix(X)
|
|
||||||
assert dtrain.num_row() == 3
|
|
||||||
assert dtrain.num_col() == 3
|
|
||||||
|
|
||||||
def test_csc(self):
|
|
||||||
row = np.array([0, 2, 2, 0, 1, 2])
|
|
||||||
col = np.array([0, 0, 1, 2, 2, 2])
|
|
||||||
data = np.array([1, 2, 3, 4, 5, 6])
|
|
||||||
X = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3))
|
|
||||||
dtrain = xgb.DMatrix(X)
|
|
||||||
assert dtrain.num_row() == 3
|
|
||||||
assert dtrain.num_col() == 3
|
|
||||||
|
|
||||||
indptr = np.array([0, 3, 5])
|
|
||||||
data = np.array([0, 1, 2, 3, 4])
|
|
||||||
row_idx = np.array([0, 1, 2, 0, 2])
|
|
||||||
X = scipy.sparse.csc_matrix((data, row_idx, indptr), shape=(3, 2))
|
|
||||||
assert tm.predictor_equal(xgb.DMatrix(X.tocsr()), xgb.DMatrix(X))
|
|
||||||
|
|
||||||
def test_coo(self):
|
|
||||||
row = np.array([0, 2, 2, 0, 1, 2])
|
|
||||||
col = np.array([0, 0, 1, 2, 2, 2])
|
|
||||||
data = np.array([1, 2, 3, 4, 5, 6])
|
|
||||||
X = scipy.sparse.coo_matrix((data, (row, col)), shape=(3, 3))
|
|
||||||
dtrain = xgb.DMatrix(X)
|
|
||||||
assert dtrain.num_row() == 3
|
|
||||||
assert dtrain.num_col() == 3
|
|
||||||
|
|
||||||
def test_np_view(self):
|
def test_np_view(self):
|
||||||
# Sliced Float32 array
|
# Sliced Float32 array
|
||||||
y = np.array([12, 34, 56], np.float32)[::2]
|
y = np.array([12, 34, 56], np.float32)[::2]
|
||||||
|
|||||||
87
tests/python/test_with_scipy.py
Normal file
87
tests/python/test_with_scipy.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
import itertools
|
||||||
|
import warnings
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import scipy.sparse
|
||||||
|
|
||||||
|
import xgboost as xgb
|
||||||
|
from xgboost import testing as tm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("error")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"DMatrixT,CSR",
|
||||||
|
[
|
||||||
|
(m, n)
|
||||||
|
for m, n in itertools.product(
|
||||||
|
(xgb.DMatrix, xgb.QuantileDMatrix),
|
||||||
|
(scipy.sparse.csr_matrix, scipy.sparse.csr_array),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_csr(DMatrixT: Type[xgb.DMatrix], CSR: Type) -> None:
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
indptr = np.array([0, 2, 3, 6])
|
||||||
|
indices = np.array([0, 2, 2, 0, 1, 2])
|
||||||
|
data = np.array([1, 2, 3, 4, 5, 6])
|
||||||
|
X = CSR((data, indices, indptr), shape=(3, 3))
|
||||||
|
dtrain = DMatrixT(X)
|
||||||
|
assert dtrain.num_row() == 3
|
||||||
|
assert dtrain.num_col() == 3
|
||||||
|
assert dtrain.num_nonmissing() == data.size
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("error")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"DMatrixT,CSC",
|
||||||
|
[
|
||||||
|
(m, n)
|
||||||
|
for m, n in itertools.product(
|
||||||
|
(xgb.DMatrix, xgb.QuantileDMatrix),
|
||||||
|
(scipy.sparse.csc_matrix, scipy.sparse.csc_array),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_csc(DMatrixT: Type[xgb.DMatrix], CSC: Type) -> None:
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
row = np.array([0, 2, 2, 0, 1, 2])
|
||||||
|
col = np.array([0, 0, 1, 2, 2, 2])
|
||||||
|
data = np.array([1, 2, 3, 4, 5, 6])
|
||||||
|
X = CSC((data, (row, col)), shape=(3, 3))
|
||||||
|
dtrain = DMatrixT(X)
|
||||||
|
assert dtrain.num_row() == 3
|
||||||
|
assert dtrain.num_col() == 3
|
||||||
|
assert dtrain.num_nonmissing() == data.size
|
||||||
|
|
||||||
|
indptr = np.array([0, 3, 5])
|
||||||
|
data = np.array([0, 1, 2, 3, 4])
|
||||||
|
row_idx = np.array([0, 1, 2, 0, 2])
|
||||||
|
X = CSC((data, row_idx, indptr), shape=(3, 2))
|
||||||
|
assert tm.predictor_equal(DMatrixT(X.tocsr()), DMatrixT(X))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("error")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"DMatrixT,COO",
|
||||||
|
[
|
||||||
|
(m, n)
|
||||||
|
for m, n in itertools.product(
|
||||||
|
(xgb.DMatrix, xgb.QuantileDMatrix),
|
||||||
|
(scipy.sparse.coo_matrix, scipy.sparse.coo_array),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_coo(DMatrixT: Type[xgb.DMatrix], COO: Type) -> None:
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
row = np.array([0, 2, 2, 0, 1, 2])
|
||||||
|
col = np.array([0, 0, 1, 2, 2, 2])
|
||||||
|
data = np.array([1, 2, 3, 4, 5, 6])
|
||||||
|
X = COO((data, (row, col)), shape=(3, 3))
|
||||||
|
dtrain = DMatrixT(X)
|
||||||
|
assert dtrain.num_row() == 3
|
||||||
|
assert dtrain.num_col() == 3
|
||||||
|
assert dtrain.num_nonmissing() == data.size
|
||||||
|
|
||||||
|
assert tm.predictor_equal(DMatrixT(X.tocsr()), DMatrixT(X))
|
||||||
Loading…
x
Reference in New Issue
Block a user