diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index cd542ba70..011afbb9e 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2174,6 +2174,7 @@ class Booster: ) return _prediction_output(shape, dims, preds, False) + # pylint: disable=too-many-statements def inplace_predict( self, data: DataType, @@ -2194,10 +2195,10 @@ class Booster: .. code-block:: python - booster.set_param({'predictor': 'gpu_predictor'}) + booster.set_param({"predictor": "gpu_predictor"}) booster.inplace_predict(cupy_array) - booster.set_param({'predictor': 'cpu_predictor}) + booster.set_param({"predictor": "cpu_predictor"}) booster.inplace_predict(numpy_array) .. versionadded:: 1.1.0 @@ -2310,14 +2311,16 @@ class Booster: ) return _prediction_output(shape, dims, preds, False) if isinstance(data, scipy.sparse.csr_matrix): - csr = data + from .data import _transform_scipy_csr + + data = _transform_scipy_csr(data) _check_call( _LIB.XGBoosterPredictFromCSR( self.handle, - _array_interface(csr.indptr), - _array_interface(csr.indices), - _array_interface(csr.data), - c_bst_ulong(csr.shape[1]), + _array_interface(data.indptr), + _array_interface(data.indices), + _array_interface(data.data), + c_bst_ulong(data.shape[1]), args, p_handle, ctypes.byref(shape), diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 026b1c6ea..c55aecc2b 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -84,6 +84,21 @@ def _array_interface(data: np.ndarray) -> bytes: return interface_str +def _transform_scipy_csr(data: DataType) -> DataType: + from scipy.sparse import csr_matrix + + indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype) + indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype) + values, _ = _ensure_np_dtype(data.data, data.data.dtype) + if ( + indptr is not data.indptr + or indices is not data.indices + or values is not data.data + ): + data = csr_matrix((values, indices, indptr), shape=data.shape) + return data + + def _from_scipy_csr( data: DataType, missing: FloatCompatible, @@ -97,18 +112,14 @@ def _from_scipy_csr( f"length mismatch: {len(data.indices)} vs {len(data.data)}" ) handle = ctypes.c_void_p() - args = { - "missing": float(missing), - "nthread": int(nthread), - } - config = bytes(json.dumps(args), "utf-8") + data = _transform_scipy_csr(data) _check_call( _LIB.XGDMatrixCreateFromCSR( _array_interface(data.indptr), _array_interface(data.indices), _array_interface(data.data), c_bst_ulong(data.shape[1]), - config, + make_jcargs(missing=float(missing), nthread=int(nthread)), ctypes.byref(handle), ) ) @@ -157,12 +168,13 @@ def _is_numpy_array(data: DataType) -> bool: def _ensure_np_dtype( - data: DataType, - dtype: Optional[NumpyDType] + data: DataType, dtype: Optional[NumpyDType] ) -> Tuple[np.ndarray, Optional[NumpyDType]]: if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]: - data = data.astype(np.float32, copy=False) dtype = np.float32 + data = data.astype(dtype, copy=False) + if not data.flags.aligned: + data = np.require(data, requirements="A") return data, dtype @@ -1213,6 +1225,7 @@ def _proxy_transform( data, _ = _ensure_np_dtype(data, data.dtype) return data, None, feature_names, feature_types if _is_scipy_csr(data): + data = _transform_scipy_csr(data) return data, None, feature_names, feature_types if _is_pandas_series(data): import pandas as pd diff --git a/src/data/array_interface.h b/src/data/array_interface.h index d8aa504df..8a4661712 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2019-2021 by Contributors +/** + * Copyright 2019-2023 by XGBoost Contributors * \file array_interface.h * \brief View of __array_interface__ */ @@ -7,9 +7,10 @@ #define XGBOOST_DATA_ARRAY_INTERFACE_H_ #include -#include +#include #include #include +#include // std::alignment_of #include #include @@ -400,6 +401,13 @@ class ArrayInterface { data = ArrayInterfaceHandler::ExtractData(array, n); static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported."); + + this->DispatchCall([&](auto const *data_typed_ptr) { + auto ptr = reinterpret_cast(data); + auto alignment = std::alignment_of>::value; + CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment."; + }); + if (allow_mask) { common::Span s_mask; size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask); diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc index c36b46b63..72e5ccc10 100644 --- a/tests/cpp/data/test_array_interface.cc +++ b/tests/cpp/data/test_array_interface.cc @@ -1,10 +1,12 @@ -/*! - * Copyright 2020-2021 by XGBoost Contributors +/** + * Copyright 2020-2023 by XGBoost Contributors */ #include #include #include "../helpers.h" #include "../../../src/data/array_interface.h" +#include "dmlc/logging.h" +#include "xgboost/json.h" namespace xgboost { TEST(ArrayInterface, Initialize) { @@ -71,6 +73,14 @@ TEST(ArrayInterface, Error) { column["mask"]["data"] = Null{}; common::Span s_mask; EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error); + + get(column).erase("mask"); + // misaligned. + j_data = {Json(Integer(reinterpret_cast( + reinterpret_cast(storage.ConstHostPointer()) + 1))), + Json(Boolean(false))}; + column["data"] = j_data; + EXPECT_THROW({ ArrayInterface<1> arr{column}; }, dmlc::Error); } TEST(ArrayInterface, GetElement) { diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 3fcc62967..ed557da32 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -327,7 +327,7 @@ class TestDMatrix: nrow = 100 ncol = 1000 x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng) - assert x.indices.max() < ncol - 1 + assert x.indices.max() < ncol x.data[:] = 1 dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)