Support half type for pandas. (#8481)

2022-11-24 12:47:40 +08:00
parent e07245f110
commit 8f97c92541
5 changed files with 109 additions and 53 deletions
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -30,6 +30,7 @@ from .core import (
    c_array,
    c_str,
    from_pystr_to_cstr,
+    make_jcargs,
 )

 DispatchedDataBackendReturnType = Tuple[
@@ -184,24 +185,15 @@ def _from_numpy_array(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    """Initialize data from a 2-D numpy matrix.
-
-    """
+    """Initialize data from a 2-D numpy matrix."""
    if len(data.shape) != 2:
-        raise ValueError(
-            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
-        )
+        raise ValueError("Expecting 2 dimensional numpy.ndarray, got: ", data.shape)
    data, _ = _ensure_np_dtype(data, data.dtype)
    handle = ctypes.c_void_p()
-    args = {
-        "missing": float(missing),
-        "nthread": int(nthread),
-    }
-    config = bytes(json.dumps(args), "utf-8")
    _check_call(
        _LIB.XGDMatrixCreateFromDense(
            _array_interface(data),
-            config,
+            make_jcargs(missing=float(missing), nthread=int(nthread)),
            ctypes.byref(handle),
        )
    )
@@ -1205,6 +1197,7 @@ def _proxy_transform(
        arr, feature_names, feature_types = _transform_pandas_df(
            data, enable_categorical, feature_names, feature_types
        )
+        arr, _ = _ensure_np_dtype(arr, arr.dtype)
        return arr, None, feature_names, feature_types
    raise TypeError("Value type is not supported for data iterator:" + str(type(data)))

--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -0,0 +1,62 @@
+"""Utilities for data generation."""
+from typing import Generator, Tuple
+
+import numpy as np
+
+
+def np_dtypes(
+    n_samples: int, n_features: int
+) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
+    """Yield (original, converted) data pairs for every supported dtype.
+
+    Each dtype is produced twice: first as a pair of numpy arrays, then as a
+    pair of pandas DataFrames.  ``original`` keeps the dtype it was generated
+    with; ``converted`` holds the same values cast to the dtype under test.
+    The random state is seeded with a constant, so the data is reproducible.
+    """
+    import pandas as pd
+
+    rng = np.random.RandomState(1994)
+    # Integer and float; values stay below 127 so they fit in every dtype listed.
+    orig = rng.randint(low=0, high=127, size=n_samples * n_features).reshape(
+        n_samples, n_features
+    )
+    dtypes = [
+        np.int32,
+        np.int64,
+        np.byte,
+        np.short,
+        np.intc,
+        np.int_,
+        np.longlong,
+        np.uint32,
+        np.uint64,
+        np.ubyte,
+        np.ushort,
+        np.uintc,
+        np.uint,
+        np.ulonglong,
+        np.float16,
+        np.float32,
+        np.float64,
+        np.half,
+        np.single,
+        np.double,
+    ]
+    for dtype in dtypes:
+        yield orig, np.array(orig, dtype=dtype)
+
+    for dtype in dtypes:
+        yield pd.DataFrame(orig), pd.DataFrame(np.array(orig, dtype=dtype))
+
+    # Boolean.  np.bool8 is omitted: it only aliases np.bool_ (covered below),
+    # was deprecated in NumPy 1.24 and no longer exists in NumPy 2.0.
+    orig = rng.binomial(1, 0.5, size=n_samples * n_features).reshape(
+        n_samples, n_features
+    )
+    bool_dtypes = [np.bool_, bool]
+    for dtype in bool_dtypes:
+        yield orig, np.array(orig, dtype=dtype)
+
+    for dtype in bool_dtypes:
+        yield pd.DataFrame(orig), pd.DataFrame(np.array(orig, dtype=dtype))