Support half type for pandas. (#8481)
parent e07245f110
commit 8f97c92541
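In user-facing terms, half-precision data, including pandas DataFrames with float16 (np.half) columns, can now be fed to DMatrix construction and in-place prediction; internally the values are upcast to a supported floating-point type. A minimal sketch of that behavior, not taken from the diff itself (the data sizes, training parameters, and the float32 comparison are illustrative assumptions):

    import numpy as np
    import pandas as pd

    import xgboost as xgb

    # Half-precision features in a pandas DataFrame, plus a float32 copy
    # of the same values for comparison.
    rng = np.random.RandomState(0)
    X16 = pd.DataFrame(rng.rand(128, 16).astype(np.float16))
    X32 = X16.astype(np.float32)
    y = rng.rand(128)

    booster = xgb.train(
        {"objective": "reg:squarederror"},
        xgb.DMatrix(X32, label=y),
        num_boost_round=4,
    )

    # The half-precision frame is accepted directly; because the values are
    # upcast internally, predictions match the float32 copy.
    np.testing.assert_allclose(
        booster.inplace_predict(X16), booster.inplace_predict(X32)
    )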
@@ -30,6 +30,7 @@ from .core import (
     c_array,
     c_str,
     from_pystr_to_cstr,
+    make_jcargs,
 )
 
 DispatchedDataBackendReturnType = Tuple[
@@ -184,24 +185,15 @@ def _from_numpy_array(
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    """Initialize data from a 2-D numpy matrix.
-
-    """
+    """Initialize data from a 2-D numpy matrix."""
     if len(data.shape) != 2:
-        raise ValueError(
-            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
-        )
+        raise ValueError("Expecting 2 dimensional numpy.ndarray, got: ", data.shape)
     data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    args = {
-        "missing": float(missing),
-        "nthread": int(nthread),
-    }
-    config = bytes(json.dumps(args), "utf-8")
     _check_call(
         _LIB.XGDMatrixCreateFromDense(
             _array_interface(data),
-            config,
+            make_jcargs(missing=float(missing), nthread=int(nthread)),
             ctypes.byref(handle),
         )
     )
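For reference, the hunk above swaps the hand-rolled JSON config for make_jcargs imported from .core. Presumably the helper just serializes its keyword arguments to UTF-8 JSON bytes for the C API, which would make it equivalent to the removed args/config block; the stand-in below illustrates that assumption (make_jcargs_sketch is a hypothetical name, not the library's implementation):

    import json
    from typing import Any


    def make_jcargs_sketch(**kwargs: Any) -> bytes:
        """Serialize keyword arguments to UTF-8 JSON bytes for C API calls.

        Illustrative stand-in for xgboost.core.make_jcargs; the real helper
        may differ in details.
        """
        return bytes(json.dumps(kwargs), "utf-8")


    # Roughly equivalent to the removed block:
    #     args = {"missing": float(missing), "nthread": int(nthread)}
    #     config = bytes(json.dumps(args), "utf-8")
    config = make_jcargs_sketch(missing=0.0, nthread=1)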
@@ -1205,6 +1197,7 @@ def _proxy_transform(
         arr, feature_names, feature_types = _transform_pandas_df(
             data, enable_categorical, feature_names, feature_types
         )
+        arr, _ = _ensure_np_dtype(arr, arr.dtype)
         return arr, None, feature_names, feature_types
     raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
 
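The single added line above is the heart of the pandas half-type support on this path: once _transform_pandas_df has produced a numpy array, _ensure_np_dtype normalizes its dtype before the array reaches the native library. A rough sketch of that normalization idea, assuming float16 (and object-dtype) data is upcast to float32; ensure_np_dtype_sketch is a hypothetical stand-in, and the real _ensure_np_dtype may cover more cases:

    from typing import Tuple

    import numpy as np


    def ensure_np_dtype_sketch(
        data: np.ndarray, dtype: np.dtype
    ) -> Tuple[np.ndarray, np.dtype]:
        """Upcast dtypes the native library cannot consume directly.

        Illustrative stand-in for the dtype normalization step; assumption:
        at minimum, float16 and object data are upcast to float32.
        """
        if data.dtype == np.float16 or data.dtype.hasobject:
            data = data.astype(np.float32, copy=False)
            dtype = np.dtype(np.float32)
        return data, dtype


    arr, _ = ensure_np_dtype_sketch(
        np.ones((2, 3), dtype=np.float16), np.dtype(np.float16)
    )
    assert arr.dtype == np.float32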
python-package/xgboost/testing/data.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+"""Utilities for data generation."""
+from typing import Generator, Tuple
+
+import numpy as np
+
+
+def np_dtypes(
+    n_samples: int, n_features: int
+) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
+    """Generate all supported dtypes from numpy."""
+    import pandas as pd
+
+    rng = np.random.RandomState(1994)
+    # Integer and float.
+    orig = rng.randint(low=0, high=127, size=n_samples * n_features).reshape(
+        n_samples, n_features
+    )
+    dtypes = [
+        np.int32,
+        np.int64,
+        np.byte,
+        np.short,
+        np.intc,
+        np.int_,
+        np.longlong,
+        np.uint32,
+        np.uint64,
+        np.ubyte,
+        np.ushort,
+        np.uintc,
+        np.uint,
+        np.ulonglong,
+        np.float16,
+        np.float32,
+        np.float64,
+        np.half,
+        np.single,
+        np.double,
+    ]
+    for dtype in dtypes:
+        X = np.array(orig, dtype=dtype)
+        yield orig, X
+
+    for dtype in dtypes:
+        X = np.array(orig, dtype=dtype)
+        df_orig = pd.DataFrame(orig)
+        df = pd.DataFrame(X)
+        yield df_orig, df
+
+    # Boolean
+    orig = rng.binomial(1, 0.5, size=n_samples * n_features).reshape(
+        n_samples, n_features
+    )
+    for dtype in [np.bool_, np.bool8, bool]:
+        X = np.array(orig, dtype=dtype)
+        yield orig, X
+
+    for dtype in [np.bool_, np.bool8, bool]:
+        X = np.array(orig, dtype=dtype)
+        df_orig = pd.DataFrame(orig)
+        df = pd.DataFrame(X)
+        yield df_orig, df
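The generator yields (reference data, converted data) pairs, first as numpy arrays and then as pandas DataFrames, so callers can check that every listed dtype produces the same values. A small usage sketch with arbitrary sizes:

    import numpy as np
    import pandas as pd

    from xgboost.testing.data import np_dtypes

    for orig, x in np_dtypes(8, 3):
        # `orig` is the reference data and `x` the same values under one of
        # the listed dtypes; both may be ndarrays or DataFrames.
        ref = orig.values if isinstance(orig, pd.DataFrame) else orig
        got = x.values if isinstance(x, pd.DataFrame) else x
        np.testing.assert_allclose(
            np.asarray(ref, dtype=np.float64), np.asarray(got, dtype=np.float64)
        )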
@@ -6,6 +6,7 @@ import pytest
 import scipy.sparse
 from hypothesis import given, settings, strategies
 from scipy.sparse import csr_matrix, rand
+from xgboost.testing.data import np_dtypes
 
 import xgboost as xgb
 from xgboost import testing as tm
@@ -453,3 +454,15 @@ class TestDMatrix:
         np.testing.assert_equal(csr.indptr, ret.indptr)
         np.testing.assert_equal(csr.data, ret.data)
         np.testing.assert_equal(csr.indices, ret.indices)
+
+    def test_dtypes(self) -> None:
+        n_samples = 128
+        n_features = 16
+        for orig, x in np_dtypes(n_samples, n_features):
+            m0 = xgb.DMatrix(orig)
+            m1 = xgb.DMatrix(x)
+            csr0 = m0.get_data()
+            csr1 = m1.get_data()
+            np.testing.assert_allclose(csr0.data, csr1.data)
+            np.testing.assert_allclose(csr0.indptr, csr1.indptr)
+            np.testing.assert_allclose(csr0.indices, csr1.indices)
@@ -5,6 +5,7 @@ import numpy as np
 import pandas as pd
 import pytest
 from scipy import sparse
+from xgboost.testing.data import np_dtypes
 from xgboost.testing.shared import validate_leaf_output
 
 import xgboost as xgb
@@ -230,46 +231,10 @@ class TestInplacePredict:
         from_dmatrix = booster.predict(dtrain)
         np.testing.assert_allclose(from_dmatrix, from_inplace)
 
-    def test_dtypes(self):
-        orig = self.rng.randint(low=0, high=127, size=self.rows * self.cols).reshape(
-            self.rows, self.cols
-        )
-        predt_orig = self.booster.inplace_predict(orig)
-        # all primitive types in numpy
-        for dtype in [
-            np.int32,
-            np.int64,
-            np.byte,
-            np.short,
-            np.intc,
-            np.int_,
-            np.longlong,
-            np.uint32,
-            np.uint64,
-            np.ubyte,
-            np.ushort,
-            np.uintc,
-            np.uint,
-            np.ulonglong,
-            np.float16,
-            np.float32,
-            np.float64,
-            np.half,
-            np.single,
-            np.double,
-        ]:
-            X = np.array(orig, dtype=dtype)
-            predt = self.booster.inplace_predict(X)
-            np.testing.assert_allclose(predt, predt_orig)
-
-        # boolean
-        orig = self.rng.binomial(1, 0.5, size=self.rows * self.cols).reshape(
-            self.rows, self.cols
-        )
-        predt_orig = self.booster.inplace_predict(orig)
-        for dtype in [np.bool8, np.bool_]:
-            X = np.array(orig, dtype=dtype)
-            predt = self.booster.inplace_predict(X)
-            np.testing.assert_allclose(predt, predt_orig)
+    def test_dtypes(self) -> None:
+        for orig, x in np_dtypes(self.rows, self.cols):
+            predt_orig = self.booster.inplace_predict(orig)
+            predt = self.booster.inplace_predict(x)
+            np.testing.assert_allclose(predt, predt_orig)
 
         # unsupported types
@@ -278,6 +243,6 @@ class TestInplacePredict:
             np.complex64,
             np.complex128,
         ]:
-            X = np.array(orig, dtype=dtype)
+            X: np.ndarray = np.array(orig, dtype=dtype)
             with pytest.raises(ValueError):
                 self.booster.inplace_predict(X)
@@ -11,6 +11,7 @@ from xgboost.testing import (
     make_categorical,
     make_sparse_regression,
 )
+from xgboost.testing.data import np_dtypes
 
 import xgboost as xgb
 
@@ -238,3 +239,25 @@ class TestQuantileDMatrix:
         np.testing.assert_allclose(
             booster.predict(qdm), booster.predict(xgb.DMatrix(qdm.get_data()))
         )
+
+    def test_dtypes(self) -> None:
+        n_samples = 128
+        n_features = 16
+        for orig, x in np_dtypes(n_samples, n_features):
+            m0 = xgb.QuantileDMatrix(orig)
+            m1 = xgb.QuantileDMatrix(x)
+            csr0 = m0.get_data()
+            csr1 = m1.get_data()
+            np.testing.assert_allclose(csr0.data, csr1.data)
+            np.testing.assert_allclose(csr0.indptr, csr1.indptr)
+            np.testing.assert_allclose(csr0.indices, csr1.indices)
+
+        # unsupported types
+        for dtype in [
+            np.string_,
+            np.complex64,
+            np.complex128,
+        ]:
+            X: np.ndarray = np.array(orig, dtype=dtype)
+            with pytest.raises(ValueError):
+                xgb.QuantileDMatrix(X)