From 8f97c9254162acba743ef68c062269ba23555908 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 24 Nov 2022 12:47:40 +0800 Subject: [PATCH] Support half type for pandas. (#8481) --- python-package/xgboost/data.py | 17 +++---- python-package/xgboost/testing/data.py | 62 ++++++++++++++++++++++++++ tests/python/test_dmatrix.py | 13 ++++++ tests/python/test_predict.py | 47 +++---------------- tests/python/test_quantile_dmatrix.py | 23 ++++++++++ 5 files changed, 109 insertions(+), 53 deletions(-) create mode 100644 python-package/xgboost/testing/data.py diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 1072c82e6..f126af52b 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -30,6 +30,7 @@ from .core import ( c_array, c_str, from_pystr_to_cstr, + make_jcargs, ) DispatchedDataBackendReturnType = Tuple[ @@ -184,24 +185,15 @@ def _from_numpy_array( feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], ) -> DispatchedDataBackendReturnType: - """Initialize data from a 2-D numpy matrix. - - """ + """Initialize data from a 2-D numpy matrix.""" if len(data.shape) != 2: - raise ValueError( - "Expecting 2 dimensional numpy.ndarray, got: ", data.shape - ) + raise ValueError("Expecting 2 dimensional numpy.ndarray, got: ", data.shape) data, _ = _ensure_np_dtype(data, data.dtype) handle = ctypes.c_void_p() - args = { - "missing": float(missing), - "nthread": int(nthread), - } - config = bytes(json.dumps(args), "utf-8") _check_call( _LIB.XGDMatrixCreateFromDense( _array_interface(data), - config, + make_jcargs(missing=float(missing), nthread=int(nthread)), ctypes.byref(handle), ) ) @@ -1205,6 +1197,7 @@ def _proxy_transform( arr, feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types ) + arr, _ = _ensure_np_dtype(arr, arr.dtype) return arr, None, feature_names, feature_types raise TypeError("Value type is not supported for data iterator:" + str(type(data))) diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py new file mode 100644 index 000000000..5dc032074 --- /dev/null +++ b/python-package/xgboost/testing/data.py @@ -0,0 +1,62 @@ +"""Utilities for data generation.""" +from typing import Generator, Tuple + +import numpy as np + + +def np_dtypes( + n_samples: int, n_features: int +) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]: + """Generate all supported dtypes from numpy.""" + import pandas as pd + + rng = np.random.RandomState(1994) + # Integer and float. + orig = rng.randint(low=0, high=127, size=n_samples * n_features).reshape( + n_samples, n_features + ) + dtypes = [ + np.int32, + np.int64, + np.byte, + np.short, + np.intc, + np.int_, + np.longlong, + np.uint32, + np.uint64, + np.ubyte, + np.ushort, + np.uintc, + np.uint, + np.ulonglong, + np.float16, + np.float32, + np.float64, + np.half, + np.single, + np.double, + ] + for dtype in dtypes: + X = np.array(orig, dtype=dtype) + yield orig, X + + for dtype in dtypes: + X = np.array(orig, dtype=dtype) + df_orig = pd.DataFrame(orig) + df = pd.DataFrame(X) + yield df_orig, df + + # Boolean + orig = rng.binomial(1, 0.5, size=n_samples * n_features).reshape( + n_samples, n_features + ) + for dtype in [np.bool_, np.bool8, bool]: + X = np.array(orig, dtype=dtype) + yield orig, X + + for dtype in [np.bool_, np.bool8, bool]: + X = np.array(orig, dtype=dtype) + df_orig = pd.DataFrame(orig) + df = pd.DataFrame(X) + yield df_orig, df diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index def369027..f192f813e 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -6,6 +6,7 @@ import pytest import scipy.sparse from hypothesis import given, settings, strategies from scipy.sparse import csr_matrix, rand +from xgboost.testing.data import np_dtypes import xgboost as xgb from xgboost import testing as tm @@ -453,3 +454,15 @@ class TestDMatrix: np.testing.assert_equal(csr.indptr, ret.indptr) np.testing.assert_equal(csr.data, ret.data) np.testing.assert_equal(csr.indices, ret.indices) + + def test_dtypes(self) -> None: + n_samples = 128 + n_features = 16 + for orig, x in np_dtypes(n_samples, n_features): + m0 = xgb.DMatrix(orig) + m1 = xgb.DMatrix(x) + csr0 = m0.get_data() + csr1 = m1.get_data() + np.testing.assert_allclose(csr0.data, csr1.data) + np.testing.assert_allclose(csr0.indptr, csr1.indptr) + np.testing.assert_allclose(csr0.indices, csr1.indices) diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 797750fde..787188b11 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest from scipy import sparse +from xgboost.testing.data import np_dtypes from xgboost.testing.shared import validate_leaf_output import xgboost as xgb @@ -230,46 +231,10 @@ class TestInplacePredict: from_dmatrix = booster.predict(dtrain) np.testing.assert_allclose(from_dmatrix, from_inplace) - def test_dtypes(self): - orig = self.rng.randint(low=0, high=127, size=self.rows * self.cols).reshape( - self.rows, self.cols - ) - predt_orig = self.booster.inplace_predict(orig) - # all primitive types in numpy - for dtype in [ - np.int32, - np.int64, - np.byte, - np.short, - np.intc, - np.int_, - np.longlong, - np.uint32, - np.uint64, - np.ubyte, - np.ushort, - np.uintc, - np.uint, - np.ulonglong, - np.float16, - np.float32, - np.float64, - np.half, - np.single, - np.double, - ]: - X = np.array(orig, dtype=dtype) - predt = self.booster.inplace_predict(X) - np.testing.assert_allclose(predt, predt_orig) - - # boolean - orig = self.rng.binomial(1, 0.5, size=self.rows * self.cols).reshape( - self.rows, self.cols - ) - predt_orig = self.booster.inplace_predict(orig) - for dtype in [np.bool8, np.bool_]: - X = np.array(orig, dtype=dtype) - predt = self.booster.inplace_predict(X) + def test_dtypes(self) -> None: + for orig, x in np_dtypes(self.rows, self.cols): + predt_orig = self.booster.inplace_predict(orig) + predt = self.booster.inplace_predict(x) np.testing.assert_allclose(predt, predt_orig) # unsupported types @@ -278,6 +243,6 @@ class TestInplacePredict: np.complex64, np.complex128, ]: - X = np.array(orig, dtype=dtype) + X: np.ndarray = np.array(orig, dtype=dtype) with pytest.raises(ValueError): self.booster.inplace_predict(X) diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 56b2a7d90..e62137e36 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -11,6 +11,7 @@ from xgboost.testing import ( make_categorical, make_sparse_regression, ) +from xgboost.testing.data import np_dtypes import xgboost as xgb @@ -238,3 +239,25 @@ class TestQuantileDMatrix: np.testing.assert_allclose( booster.predict(qdm), booster.predict(xgb.DMatrix(qdm.get_data())) ) + + def test_dtypes(self) -> None: + n_samples = 128 + n_features = 16 + for orig, x in np_dtypes(n_samples, n_features): + m0 = xgb.QuantileDMatrix(orig) + m1 = xgb.QuantileDMatrix(x) + csr0 = m0.get_data() + csr1 = m1.get_data() + np.testing.assert_allclose(csr0.data, csr1.data) + np.testing.assert_allclose(csr0.indptr, csr1.indptr) + np.testing.assert_allclose(csr0.indices, csr1.indices) + + # unsupported types + for dtype in [ + np.string_, + np.complex64, + np.complex128, + ]: + X: np.ndarray = np.array(orig, dtype=dtype) + with pytest.raises(ValueError): + xgb.QuantileDMatrix(X)