Support half type for pandas. (#8481)
parent e07245f110
commit 8f97c92541
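In user-facing terms, half-precision data, including pandas DataFrames with float16 (np.half) columns, can now be fed to DMatrix construction and in-place prediction; internally the values are upcast to a supported floating-point type. A minimal sketch of that behavior, not taken from the diff itself (the data sizes, training parameters, and the float32 comparison are illustrative assumptions):

    import numpy as np
    import pandas as pd

    import xgboost as xgb

    # Half-precision features in a pandas DataFrame, plus a float32 copy
    # of the same values for comparison.
    rng = np.random.RandomState(0)
    X16 = pd.DataFrame(rng.rand(128, 16).astype(np.float16))
    X32 = X16.astype(np.float32)
    y = rng.rand(128)

    booster = xgb.train(
        {"objective": "reg:squarederror"},
        xgb.DMatrix(X32, label=y),
        num_boost_round=4,
    )

    # The half-precision frame is accepted directly; because the values are
    # upcast internally, predictions match the float32 copy.
    np.testing.assert_allclose(
        booster.inplace_predict(X16), booster.inplace_predict(X32)
    )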
@@ -30,6 +30,7 @@ from .core import (
     c_array,
     c_str,
     from_pystr_to_cstr,
+    make_jcargs,
 )
 
 DispatchedDataBackendReturnType = Tuple[
@@ -184,24 +185,15 @@ def _from_numpy_array(
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    """Initialize data from a 2-D numpy matrix.
-
-    """
+    """Initialize data from a 2-D numpy matrix."""
     if len(data.shape) != 2:
-        raise ValueError(
-            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
-        )
+        raise ValueError("Expecting 2 dimensional numpy.ndarray, got: ", data.shape)
     data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    args = {
-        "missing": float(missing),
-        "nthread": int(nthread),
-    }
-    config = bytes(json.dumps(args), "utf-8")
     _check_call(
         _LIB.XGDMatrixCreateFromDense(
             _array_interface(data),
-            config,
+            make_jcargs(missing=float(missing), nthread=int(nthread)),
             ctypes.byref(handle),
         )
     )
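For reference, the hunk above swaps the hand-rolled JSON config for make_jcargs imported from .core. Presumably the helper just serializes its keyword arguments to UTF-8 JSON bytes for the C API, which would make it equivalent to the removed args/config block; the stand-in below illustrates that assumption (make_jcargs_sketch is a hypothetical name, not the library's implementation):

    import json
    from typing import Any


    def make_jcargs_sketch(**kwargs: Any) -> bytes:
        """Serialize keyword arguments to UTF-8 JSON bytes for C API calls.

        Illustrative stand-in for xgboost.core.make_jcargs; the real helper
        may differ in details.
        """
        return bytes(json.dumps(kwargs), "utf-8")


    # Roughly equivalent to the removed block:
    #     args = {"missing": float(missing), "nthread": int(nthread)}
    #     config = bytes(json.dumps(args), "utf-8")
    config = make_jcargs_sketch(missing=0.0, nthread=1)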
@@ -1205,6 +1197,7 @@ def _proxy_transform(
         arr, feature_names, feature_types = _transform_pandas_df(
             data, enable_categorical, feature_names, feature_types
         )
+        arr, _ = _ensure_np_dtype(arr, arr.dtype)
         return arr, None, feature_names, feature_types
     raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
 
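The single added line above is the heart of the pandas half-type support on this path: once _transform_pandas_df has produced a numpy array, _ensure_np_dtype normalizes its dtype before the array reaches the native library. A rough sketch of that normalization idea, assuming float16 (and object-dtype) data is upcast to float32; ensure_np_dtype_sketch is a hypothetical stand-in, and the real _ensure_np_dtype may cover more cases:

    from typing import Tuple

    import numpy as np


    def ensure_np_dtype_sketch(
        data: np.ndarray, dtype: np.dtype
    ) -> Tuple[np.ndarray, np.dtype]:
        """Upcast dtypes the native library cannot consume directly.

        Illustrative stand-in for the dtype normalization step; assumption:
        at minimum, float16 and object data are upcast to float32.
        """
        if data.dtype == np.float16 or data.dtype.hasobject:
            data = data.astype(np.float32, copy=False)
            dtype = np.dtype(np.float32)
        return data, dtype


    arr, _ = ensure_np_dtype_sketch(
        np.ones((2, 3), dtype=np.float16), np.dtype(np.float16)
    )
    assert arr.dtype == np.float32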
python-package/xgboost/testing/data.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+"""Utilities for data generation."""
+from typing import Generator, Tuple
+
+import numpy as np
+
+
+def np_dtypes(
+    n_samples: int, n_features: int
+) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
+    """Generate all supported dtypes from numpy."""
+    import pandas as pd
+
+    rng = np.random.RandomState(1994)
+    # Integer and float.
+    orig = rng.randint(low=0, high=127, size=n_samples * n_features).reshape(
+        n_samples, n_features
+    )
+    dtypes = [
+        np.int32,
+        np.int64,
+        np.byte,
+        np.short,
+        np.intc,
+        np.int_,
+        np.longlong,
+        np.uint32,
+        np.uint64,
+        np.ubyte,
+        np.ushort,
+        np.uintc,
+        np.uint,
+        np.ulonglong,
+        np.float16,
+        np.float32,
+        np.float64,
+        np.half,
+        np.single,
+        np.double,
+    ]
+    for dtype in dtypes:
+        X = np.array(orig, dtype=dtype)
+        yield orig, X
+
+    for dtype in dtypes:
+        X = np.array(orig, dtype=dtype)
+        df_orig = pd.DataFrame(orig)
+        df = pd.DataFrame(X)
+        yield df_orig, df
+
+    # Boolean
+    orig = rng.binomial(1, 0.5, size=n_samples * n_features).reshape(
+        n_samples, n_features
+    )
+    for dtype in [np.bool_, np.bool8, bool]:
+        X = np.array(orig, dtype=dtype)
+        yield orig, X
+
+    for dtype in [np.bool_, np.bool8, bool]:
+        X = np.array(orig, dtype=dtype)
+        df_orig = pd.DataFrame(orig)
+        df = pd.DataFrame(X)
+        yield df_orig, df
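The generator yields (reference data, converted data) pairs, first as numpy arrays and then as pandas DataFrames, so callers can check that every listed dtype produces the same values. A small usage sketch with arbitrary sizes:

    import numpy as np
    import pandas as pd

    from xgboost.testing.data import np_dtypes

    for orig, x in np_dtypes(8, 3):
        # `orig` is the reference data and `x` the same values under one of
        # the listed dtypes; both may be ndarrays or DataFrames.
        ref = orig.values if isinstance(orig, pd.DataFrame) else orig
        got = x.values if isinstance(x, pd.DataFrame) else x
        np.testing.assert_allclose(
            np.asarray(ref, dtype=np.float64), np.asarray(got, dtype=np.float64)
        )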
@@ -6,6 +6,7 @@ import pytest
 import scipy.sparse
 from hypothesis import given, settings, strategies
 from scipy.sparse import csr_matrix, rand
+from xgboost.testing.data import np_dtypes
 
 import xgboost as xgb
 from xgboost import testing as tm
@@ -453,3 +454,15 @@ class TestDMatrix:
         np.testing.assert_equal(csr.indptr, ret.indptr)
         np.testing.assert_equal(csr.data, ret.data)
         np.testing.assert_equal(csr.indices, ret.indices)
+
+    def test_dtypes(self) -> None:
+        n_samples = 128
+        n_features = 16
+        for orig, x in np_dtypes(n_samples, n_features):
+            m0 = xgb.DMatrix(orig)
+            m1 = xgb.DMatrix(x)
+            csr0 = m0.get_data()
+            csr1 = m1.get_data()
+            np.testing.assert_allclose(csr0.data, csr1.data)
+            np.testing.assert_allclose(csr0.indptr, csr1.indptr)
+            np.testing.assert_allclose(csr0.indices, csr1.indices)
@@ -5,6 +5,7 @@ import numpy as np
 import pandas as pd
 import pytest
 from scipy import sparse
+from xgboost.testing.data import np_dtypes
 from xgboost.testing.shared import validate_leaf_output
 
 import xgboost as xgb
@@ -230,46 +231,10 @@ class TestInplacePredict:
         from_dmatrix = booster.predict(dtrain)
         np.testing.assert_allclose(from_dmatrix, from_inplace)
 
-    def test_dtypes(self):
-        orig = self.rng.randint(low=0, high=127, size=self.rows * self.cols).reshape(
-            self.rows, self.cols
-        )
-        predt_orig = self.booster.inplace_predict(orig)
-        # all primitive types in numpy
-        for dtype in [
-            np.int32,
-            np.int64,
-            np.byte,
-            np.short,
-            np.intc,
-            np.int_,
-            np.longlong,
-            np.uint32,
-            np.uint64,
-            np.ubyte,
-            np.ushort,
-            np.uintc,
-            np.uint,
-            np.ulonglong,
-            np.float16,
-            np.float32,
-            np.float64,
-            np.half,
-            np.single,
-            np.double,
-        ]:
-            X = np.array(orig, dtype=dtype)
-            predt = self.booster.inplace_predict(X)
-            np.testing.assert_allclose(predt, predt_orig)
-
-        # boolean
-        orig = self.rng.binomial(1, 0.5, size=self.rows * self.cols).reshape(
-            self.rows, self.cols
-        )
-        predt_orig = self.booster.inplace_predict(orig)
-        for dtype in [np.bool8, np.bool_]:
-            X = np.array(orig, dtype=dtype)
-            predt = self.booster.inplace_predict(X)
-            np.testing.assert_allclose(predt, predt_orig)
+    def test_dtypes(self) -> None:
+        for orig, x in np_dtypes(self.rows, self.cols):
+            predt_orig = self.booster.inplace_predict(orig)
+            predt = self.booster.inplace_predict(x)
+            np.testing.assert_allclose(predt, predt_orig)
 
         # unsupported types
@@ -278,6 +243,6 @@ class TestInplacePredict:
             np.complex64,
             np.complex128,
         ]:
-            X = np.array(orig, dtype=dtype)
+            X: np.ndarray = np.array(orig, dtype=dtype)
             with pytest.raises(ValueError):
                 self.booster.inplace_predict(X)
@@ -11,6 +11,7 @@ from xgboost.testing import (
     make_categorical,
     make_sparse_regression,
 )
+from xgboost.testing.data import np_dtypes
 
 import xgboost as xgb
 
@@ -238,3 +239,25 @@ class TestQuantileDMatrix:
         np.testing.assert_allclose(
             booster.predict(qdm), booster.predict(xgb.DMatrix(qdm.get_data()))
         )
+
+    def test_dtypes(self) -> None:
+        n_samples = 128
+        n_features = 16
+        for orig, x in np_dtypes(n_samples, n_features):
+            m0 = xgb.QuantileDMatrix(orig)
+            m1 = xgb.QuantileDMatrix(x)
+            csr0 = m0.get_data()
+            csr1 = m1.get_data()
+            np.testing.assert_allclose(csr0.data, csr1.data)
+            np.testing.assert_allclose(csr0.indptr, csr1.indptr)
+            np.testing.assert_allclose(csr0.indices, csr1.indices)
+
+        # unsupported types
+        for dtype in [
+            np.string_,
+            np.complex64,
+            np.complex128,
+        ]:
+            X: np.ndarray = np.array(orig, dtype=dtype)
+            with pytest.raises(ValueError):
+                xgb.QuantileDMatrix(X)