From 37ad60fe2574d073d534ab408c3eac5779107bfe Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sun, 2 May 2021 00:09:01 +0800
Subject: [PATCH] Enforce input data is not `object`. (#6927)

* Check for object data type.
* Allow strided arrays with greater underlying buffer size.
---
 python-package/xgboost/core.py |  7 +++++--
 python-package/xgboost/data.py |  8 ++++++++
 src/data/array_interface.h     |  7 +++++--
 tests/python/test_predict.py   | 13 +++++++++++++
 4 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index ae1595b2e..227f0bdee 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -233,6 +233,9 @@ def _numpy2ctypes_type(dtype):
 
 
 def _array_interface(data: np.ndarray) -> bytes:
+    assert (
+        data.dtype.hasobject is False
+    ), "Input data contains `object` dtype. Expecting numeric data."
     interface = data.__array_interface__
     if "mask" in interface:
         interface["mask"] = interface["mask"].__array_interface__
@@ -1908,8 +1911,8 @@ class Booster(object):
                 )
 
         if isinstance(data, np.ndarray):
-            from .data import _maybe_np_slice
-            data = _maybe_np_slice(data, data.dtype)
+            from .data import _ensure_np_dtype
+            data, _ = _ensure_np_dtype(data, data.dtype)
             _check_call(
                 _LIB.XGBoosterPredictFromDense(
                     self.handle,
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index c8206e7ff..002f801ef 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -104,6 +104,13 @@ def _is_numpy_array(data):
     return isinstance(data, (np.ndarray, np.matrix))
 
 
+def _ensure_np_dtype(data, dtype):
+    if data.dtype.hasobject:
+        data = data.astype(np.float32, copy=False)
+        dtype = np.float32
+    return data, dtype
+
+
 def _maybe_np_slice(data, dtype):
     '''Handle numpy slice. This can be removed if we use __array_interface__.
     '''
@@ -118,6 +125,7 @@ def _maybe_np_slice(data, dtype):
             data = np.array(data, copy=False, dtype=dtype)
     except AttributeError:
         data = np.array(data, copy=False, dtype=dtype)
+    data, dtype = _ensure_np_dtype(data, dtype)
     return data
 
 
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index 71db92b6a..9d87f316d 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -229,8 +229,11 @@ class ArrayInterfaceHandler {
       }
       strides[1] = n;
     }
-    auto valid = (rows - 1) * strides[0] + (cols - 1) * strides[1] == (rows * cols) - 1;
-    CHECK(valid) << "Invalid strides in array.";
+
+    auto valid = rows * strides[0] + cols * strides[1] >= (rows * cols);
+    CHECK(valid) << "Invalid strides in array."
+                 << " strides: (" << strides[0] << "," << strides[1]
+                 << "), shape: (" << rows << ", " << cols << ")";
   }
 
   static void* ExtractData(std::map const &column,
diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py
index d451cd831..3ad436a2b 100644
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -155,6 +155,14 @@ class TestInplacePredict:
         predt_from_array = booster.inplace_predict(X[:10, ...], missing=self.missing)
         predt_from_dmatrix = booster.predict(test)
 
+        X_obj = X.copy().astype(object)
+
+        assert X_obj.dtype.hasobject is True
+        assert X.dtype.hasobject is False
+        np.testing.assert_allclose(
+            booster.inplace_predict(X_obj), booster.inplace_predict(X)
+        )
+
         np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)
 
         predt_from_array = booster.inplace_predict(
@@ -192,8 +200,13 @@ class TestInplacePredict:
         arr_predt = booster.inplace_predict(X)
         dmat_predt = booster.predict(xgb.DMatrix(X))
 
+        X = df.values
+        X = np.asfortranarray(X)
+        fort_predt = booster.inplace_predict(X)
+
         np.testing.assert_allclose(dmat_predt, arr_predt)
         np.testing.assert_allclose(df_predt, arr_predt)
+        np.testing.assert_allclose(fort_predt, arr_predt)
 
     def test_base_margin(self):
         booster = self.booster