Support all pandas nullable integer types. (#8480)
- Enumerate all pandas integer types.
- Tests for `None`, `nan`, and `pd.NA`.
This commit is contained in:
parent
f2209c1fe4
commit
d666ba775e
@ -1068,7 +1068,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
|||||||
return ret.value
|
return ret.value
|
||||||
|
|
||||||
def num_nonmissing(self) -> int:
|
def num_nonmissing(self) -> int:
|
||||||
"""Get the number of non-missing values in the DMatrix."""
|
"""Get the number of non-missing values in the DMatrix.
|
||||||
|
|
||||||
|
.. versionadded:: 1.7.0
|
||||||
|
|
||||||
|
"""
|
||||||
ret = c_bst_ulong()
|
ret = c_bst_ulong()
|
||||||
_check_call(_LIB.XGDMatrixNumNonMissing(self.handle, ctypes.byref(ret)))
|
_check_call(_LIB.XGDMatrixNumNonMissing(self.handle, ctypes.byref(ret)))
|
||||||
return ret.value
|
return ret.value
|
||||||
|
|||||||
@ -34,7 +34,8 @@ from .core import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
DispatchedDataBackendReturnType = Tuple[
|
DispatchedDataBackendReturnType = Tuple[
|
||||||
ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]]
|
ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]
|
||||||
|
]
|
||||||
|
|
||||||
CAT_T = "c"
|
CAT_T = "c"
|
||||||
|
|
||||||
@ -217,27 +218,36 @@ def _is_modin_df(data: DataType) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
_pandas_dtype_mapper = {
|
_pandas_dtype_mapper = {
|
||||||
'int8': 'int',
|
"int8": "int",
|
||||||
'int16': 'int',
|
"int16": "int",
|
||||||
'int32': 'int',
|
"int32": "int",
|
||||||
'int64': 'int',
|
"int64": "int",
|
||||||
'uint8': 'int',
|
"uint8": "int",
|
||||||
'uint16': 'int',
|
"uint16": "int",
|
||||||
'uint32': 'int',
|
"uint32": "int",
|
||||||
'uint64': 'int',
|
"uint64": "int",
|
||||||
'float16': 'float',
|
"float16": "float",
|
||||||
'float32': 'float',
|
"float32": "float",
|
||||||
'float64': 'float',
|
"float64": "float",
|
||||||
'bool': 'i',
|
"bool": "i",
|
||||||
# nullable types
|
}
|
||||||
|
|
||||||
|
# nullable types
|
||||||
|
pandas_nullable_mapper = {
|
||||||
|
"Int8": "int",
|
||||||
"Int16": "int",
|
"Int16": "int",
|
||||||
"Int32": "int",
|
"Int32": "int",
|
||||||
"Int64": "int",
|
"Int64": "int",
|
||||||
|
"UInt8": "i",
|
||||||
|
"UInt16": "i",
|
||||||
|
"UInt32": "i",
|
||||||
|
"UInt64": "i",
|
||||||
"Float32": "float",
|
"Float32": "float",
|
||||||
"Float64": "float",
|
"Float64": "float",
|
||||||
"boolean": "i",
|
"boolean": "i",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_pandas_dtype_mapper.update(pandas_nullable_mapper)
|
||||||
|
|
||||||
_ENABLE_CAT_ERR = (
|
_ENABLE_CAT_ERR = (
|
||||||
"When categorical type is supplied, The experimental DMatrix parameter"
|
"When categorical type is supplied, The experimental DMatrix parameter"
|
||||||
@ -304,27 +314,27 @@ def is_nullable_dtype(dtype: PandasDType) -> bool:
|
|||||||
is_integer_dtype,
|
is_integer_dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
# dtype: pd.core.arrays.numeric.NumericDtype
|
is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
|
||||||
nullable_alias = {"Int16", "Int32", "Int64", "Float32", "Float64", "category"}
|
|
||||||
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
|
|
||||||
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
|
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
|
||||||
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
|
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
|
||||||
is_float = is_float_dtype(dtype) and dtype.name in nullable_alias
|
is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
|
||||||
return is_int or is_bool or is_float or is_categorical_dtype(dtype)
|
return is_int or is_bool or is_float or is_categorical_dtype(dtype)
|
||||||
|
|
||||||
|
|
||||||
def _pandas_cat_null(data: DataFrame) -> DataFrame:
|
def pandas_cat_null(data: DataFrame) -> DataFrame:
|
||||||
|
"""Handle categorical dtype and nullable extension types from pandas."""
|
||||||
from pandas.api.types import is_categorical_dtype
|
from pandas.api.types import is_categorical_dtype
|
||||||
|
|
||||||
# handle category codes and nullable.
|
# handle category codes and nullable.
|
||||||
cat_columns = [
|
cat_columns = []
|
||||||
col
|
nul_columns = []
|
||||||
for col, dtype in zip(data.columns, data.dtypes)
|
for col, dtype in zip(data.columns, data.dtypes):
|
||||||
if is_categorical_dtype(dtype)
|
if is_categorical_dtype(dtype):
|
||||||
]
|
cat_columns.append(col)
|
||||||
nul_columns = [
|
# avoid an unnecessary conversion if possible
|
||||||
col for col, dtype in zip(data.columns, data.dtypes) if is_nullable_dtype(dtype)
|
elif is_nullable_dtype(dtype):
|
||||||
]
|
nul_columns.append(col)
|
||||||
|
|
||||||
if cat_columns or nul_columns:
|
if cat_columns or nul_columns:
|
||||||
# Avoid transformation due to: PerformanceWarning: DataFrame is highly
|
# Avoid transformation due to: PerformanceWarning: DataFrame is highly
|
||||||
# fragmented
|
# fragmented
|
||||||
@ -333,7 +343,7 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame:
|
|||||||
transformed = data
|
transformed = data
|
||||||
|
|
||||||
if cat_columns:
|
if cat_columns:
|
||||||
# DF doesn't have the cat attribute, so we use apply here
|
# DF doesn't have the cat attribute, as a result, we use apply here
|
||||||
transformed[cat_columns] = (
|
transformed[cat_columns] = (
|
||||||
transformed[cat_columns]
|
transformed[cat_columns]
|
||||||
.apply(lambda x: x.cat.codes)
|
.apply(lambda x: x.cat.codes)
|
||||||
@ -343,6 +353,10 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame:
|
|||||||
if nul_columns:
|
if nul_columns:
|
||||||
transformed[nul_columns] = transformed[nul_columns].astype(np.float32)
|
transformed[nul_columns] = transformed[nul_columns].astype(np.float32)
|
||||||
|
|
||||||
|
# TODO(jiamingy): Investigate the possibility of using dataframe protocol or arrow
|
||||||
|
# IPC format for pandas so that we can apply the data transformation inside XGBoost
|
||||||
|
# for better memory efficiency.
|
||||||
|
|
||||||
return transformed
|
return transformed
|
||||||
|
|
||||||
|
|
||||||
@ -357,9 +371,8 @@ def _transform_pandas_df(
|
|||||||
from pandas.api.types import is_categorical_dtype, is_sparse
|
from pandas.api.types import is_categorical_dtype, is_sparse
|
||||||
|
|
||||||
if not all(
|
if not all(
|
||||||
dtype.name in _pandas_dtype_mapper
|
(dtype.name in _pandas_dtype_mapper)
|
||||||
or is_sparse(dtype)
|
or is_sparse(dtype)
|
||||||
or (is_nullable_dtype(dtype) and not is_categorical_dtype(dtype))
|
|
||||||
or (is_categorical_dtype(dtype) and enable_categorical)
|
or (is_categorical_dtype(dtype) and enable_categorical)
|
||||||
for dtype in data.dtypes
|
for dtype in data.dtypes
|
||||||
):
|
):
|
||||||
@ -369,7 +382,7 @@ def _transform_pandas_df(
|
|||||||
data, meta, feature_names, feature_types, enable_categorical
|
data, meta, feature_names, feature_types, enable_categorical
|
||||||
)
|
)
|
||||||
|
|
||||||
transformed = _pandas_cat_null(data)
|
transformed = pandas_cat_null(data)
|
||||||
|
|
||||||
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
|
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
|
||||||
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
|
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
|
||||||
@ -404,14 +417,12 @@ def _is_pandas_series(data: DataType) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def _meta_from_pandas_series(
|
def _meta_from_pandas_series(
|
||||||
data: DataType,
|
data: DataType, name: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p
|
||||||
name: str,
|
|
||||||
dtype: Optional[NumpyDType],
|
|
||||||
handle: ctypes.c_void_p
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Help transform pandas series for meta data like labels"""
|
"""Help transform pandas series for meta data like labels"""
|
||||||
data = data.values.astype('float')
|
data = data.values.astype("float")
|
||||||
from pandas.api.types import is_sparse
|
from pandas.api.types import is_sparse
|
||||||
|
|
||||||
if is_sparse(data):
|
if is_sparse(data):
|
||||||
data = data.to_dense() # type: ignore
|
data = data.to_dense() # type: ignore
|
||||||
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
|
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
|
||||||
|
|||||||
@ -773,6 +773,19 @@ def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:
|
|||||||
return all((y - x) < tolerance for x, y in zip(L, L[1:]))
|
return all((y - x) < tolerance for x, y in zip(L, L[1:]))
|
||||||
|
|
||||||
|
|
||||||
|
def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
|
||||||
|
"""Check whether two DMatrices contain the same predictors."""
|
||||||
|
lcsr = lhs.get_data()
|
||||||
|
rcsr = rhs.get_data()
|
||||||
|
return all(
|
||||||
|
(
|
||||||
|
np.array_equal(lcsr.data, rcsr.data),
|
||||||
|
np.array_equal(lcsr.indices, rcsr.indices),
|
||||||
|
np.array_equal(lcsr.indptr, rcsr.indptr),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
|
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
|
||||||
"""Evaluation metric for xgb.train"""
|
"""Evaluation metric for xgb.train"""
|
||||||
label = dtrain.get_label()
|
label = dtrain.get_label()
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
"""Utilities for data generation."""
|
"""Utilities for data generation."""
|
||||||
from typing import Generator, Tuple
|
from typing import Any, Generator, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@ -7,7 +7,7 @@ import numpy as np
|
|||||||
def np_dtypes(
|
def np_dtypes(
|
||||||
n_samples: int, n_features: int
|
n_samples: int, n_features: int
|
||||||
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
|
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
|
||||||
"""Generate all supported dtypes from numpy."""
|
"""Enumerate all supported dtypes from numpy."""
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
rng = np.random.RandomState(1994)
|
rng = np.random.RandomState(1994)
|
||||||
@ -60,3 +60,61 @@ def np_dtypes(
|
|||||||
df_orig = pd.DataFrame(orig)
|
df_orig = pd.DataFrame(orig)
|
||||||
df = pd.DataFrame(X)
|
df = pd.DataFrame(X)
|
||||||
yield df_orig, df
|
yield df_orig, df
|
||||||
|
|
||||||
|
|
||||||
|
def pd_dtypes() -> Generator:
|
||||||
|
"""Enumerate all supported pandas extension types."""
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Integer
|
||||||
|
dtypes = [
|
||||||
|
pd.UInt8Dtype(),
|
||||||
|
pd.UInt16Dtype(),
|
||||||
|
pd.UInt32Dtype(),
|
||||||
|
pd.UInt64Dtype(),
|
||||||
|
pd.Int8Dtype(),
|
||||||
|
pd.Int16Dtype(),
|
||||||
|
pd.Int32Dtype(),
|
||||||
|
pd.Int64Dtype(),
|
||||||
|
]
|
||||||
|
|
||||||
|
Null: Union[float, None, Any] = np.nan
|
||||||
|
orig = pd.DataFrame(
|
||||||
|
{"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=np.float32
|
||||||
|
)
|
||||||
|
for Null in (np.nan, None, pd.NA):
|
||||||
|
for dtype in dtypes:
|
||||||
|
df = pd.DataFrame(
|
||||||
|
{"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=dtype
|
||||||
|
)
|
||||||
|
yield orig, df
|
||||||
|
|
||||||
|
# Float
|
||||||
|
Null = np.nan
|
||||||
|
dtypes = [pd.Float32Dtype(), pd.Float64Dtype()]
|
||||||
|
orig = pd.DataFrame(
|
||||||
|
{"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=np.float32
|
||||||
|
)
|
||||||
|
for Null in (np.nan, None, pd.NA):
|
||||||
|
for dtype in dtypes:
|
||||||
|
df = pd.DataFrame(
|
||||||
|
{"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=dtype
|
||||||
|
)
|
||||||
|
yield orig, df
|
||||||
|
|
||||||
|
# Categorical
|
||||||
|
orig = orig.astype("category")
|
||||||
|
for Null in (np.nan, None, pd.NA):
|
||||||
|
df = pd.DataFrame(
|
||||||
|
{"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]},
|
||||||
|
dtype=pd.CategoricalDtype(),
|
||||||
|
)
|
||||||
|
yield orig, df
|
||||||
|
|
||||||
|
# Boolean
|
||||||
|
for Null in [None, pd.NA]:
|
||||||
|
data = {"f0": [True, False, Null, True], "f1": [False, True, Null, True]}
|
||||||
|
# pd.NA is not convertible to bool.
|
||||||
|
orig = pd.DataFrame(data, dtype=np.bool_ if Null is None else pd.BooleanDtype())
|
||||||
|
df = pd.DataFrame(data, dtype=pd.BooleanDtype())
|
||||||
|
yield orig, df
|
||||||
|
|||||||
@ -461,8 +461,4 @@ class TestDMatrix:
|
|||||||
for orig, x in np_dtypes(n_samples, n_features):
|
for orig, x in np_dtypes(n_samples, n_features):
|
||||||
m0 = xgb.DMatrix(orig)
|
m0 = xgb.DMatrix(orig)
|
||||||
m1 = xgb.DMatrix(x)
|
m1 = xgb.DMatrix(x)
|
||||||
csr0 = m0.get_data()
|
assert tm.predictor_equal(m0, m1)
|
||||||
csr1 = m1.get_data()
|
|
||||||
np.testing.assert_allclose(csr0.data, csr1.data)
|
|
||||||
np.testing.assert_allclose(csr0.indptr, csr1.indptr)
|
|
||||||
np.testing.assert_allclose(csr0.indices, csr1.indices)
|
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from xgboost.testing import (
|
|||||||
make_batches_sparse,
|
make_batches_sparse,
|
||||||
make_categorical,
|
make_categorical,
|
||||||
make_sparse_regression,
|
make_sparse_regression,
|
||||||
|
predictor_equal,
|
||||||
)
|
)
|
||||||
from xgboost.testing.data import np_dtypes
|
from xgboost.testing.data import np_dtypes
|
||||||
|
|
||||||
@ -246,11 +247,7 @@ class TestQuantileDMatrix:
|
|||||||
for orig, x in np_dtypes(n_samples, n_features):
|
for orig, x in np_dtypes(n_samples, n_features):
|
||||||
m0 = xgb.QuantileDMatrix(orig)
|
m0 = xgb.QuantileDMatrix(orig)
|
||||||
m1 = xgb.QuantileDMatrix(x)
|
m1 = xgb.QuantileDMatrix(x)
|
||||||
csr0 = m0.get_data()
|
assert predictor_equal(m0, m1)
|
||||||
csr1 = m1.get_data()
|
|
||||||
np.testing.assert_allclose(csr0.data, csr1.data)
|
|
||||||
np.testing.assert_allclose(csr0.indptr, csr1.indptr)
|
|
||||||
np.testing.assert_allclose(csr0.indices, csr1.indices)
|
|
||||||
|
|
||||||
# unsupported types
|
# unsupported types
|
||||||
for dtype in [
|
for dtype in [
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import tempfile
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
from test_dmatrix import set_base_margin_info
|
from test_dmatrix import set_base_margin_info
|
||||||
|
from xgboost.testing.data import pd_dtypes
|
||||||
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from xgboost import testing as tm
|
from xgboost import testing as tm
|
||||||
@ -297,70 +298,22 @@ class TestPandas:
|
|||||||
assert 'auc' not in cv.columns[0]
|
assert 'auc' not in cv.columns[0]
|
||||||
assert 'error' in cv.columns[0]
|
assert 'error' in cv.columns[0]
|
||||||
|
|
||||||
def test_nullable_type(self):
|
def test_nullable_type(self) -> None:
|
||||||
y = np.random.default_rng(0).random(4)
|
from pandas.api.types import is_categorical
|
||||||
|
|
||||||
def to_bytes(Xy: xgb.DMatrix) -> bytes:
|
for DMatrixT in (xgb.DMatrix, xgb.QuantileDMatrix):
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
for orig, df in pd_dtypes():
|
||||||
path = os.path.join(tmpdir, "Xy.dmatrix")
|
enable_categorical = any(is_categorical for dtype in df.dtypes)
|
||||||
Xy.save_binary(path)
|
|
||||||
with open(path, "rb") as fd:
|
|
||||||
result = fd.read()
|
|
||||||
return result
|
|
||||||
|
|
||||||
def test_int(dtype) -> bytes:
|
m_orig = DMatrixT(orig, enable_categorical=enable_categorical)
|
||||||
arr = pd.DataFrame(
|
# extension types
|
||||||
{"f0": [1, 2, None, 3], "f1": [4, 3, None, 1]}, dtype=dtype
|
m_etype = DMatrixT(df, enable_categorical=enable_categorical)
|
||||||
)
|
# different from pd.BooleanDtype(), None is converted to False with bool
|
||||||
Xy = xgb.DMatrix(arr, y)
|
if any(dtype == "bool" for dtype in orig.dtypes):
|
||||||
Xy.feature_types = None
|
assert not tm.predictor_equal(m_orig, m_etype)
|
||||||
return to_bytes(Xy)
|
else:
|
||||||
|
assert tm.predictor_equal(m_orig, m_etype)
|
||||||
|
|
||||||
b0 = test_int(np.float32)
|
f0 = df["f0"]
|
||||||
b1 = test_int(pd.Int16Dtype())
|
with pytest.raises(ValueError, match="Label contains NaN"):
|
||||||
assert b0 == b1
|
xgb.DMatrix(df, f0, enable_categorical=enable_categorical)
|
||||||
|
|
||||||
def test_bool(dtype) -> bytes:
|
|
||||||
arr = pd.DataFrame(
|
|
||||||
{"f0": [True, False, None, True], "f1": [False, True, None, True]},
|
|
||||||
dtype=dtype,
|
|
||||||
)
|
|
||||||
Xy = xgb.DMatrix(arr, y)
|
|
||||||
Xy.feature_types = None
|
|
||||||
return to_bytes(Xy)
|
|
||||||
|
|
||||||
b0 = test_bool(pd.BooleanDtype())
|
|
||||||
b1 = test_bool(bool)
|
|
||||||
assert b0 != b1 # None is converted to False with np.bool
|
|
||||||
|
|
||||||
data = {"f0": [1.0, 2.0, None, 3.0], "f1": [3.0, 2.0, None, 1.0]}
|
|
||||||
|
|
||||||
arr = np.array([data["f0"], data["f1"]]).T
|
|
||||||
Xy = xgb.DMatrix(arr, y)
|
|
||||||
Xy.feature_types = None
|
|
||||||
Xy.feature_names = None
|
|
||||||
from_np = to_bytes(Xy)
|
|
||||||
|
|
||||||
def test_float(dtype) -> bytes:
|
|
||||||
arr = pd.DataFrame(data, dtype=dtype)
|
|
||||||
Xy = xgb.DMatrix(arr, y)
|
|
||||||
Xy.feature_types = None
|
|
||||||
Xy.feature_names = None
|
|
||||||
return to_bytes(Xy)
|
|
||||||
|
|
||||||
b0 = test_float(pd.Float64Dtype())
|
|
||||||
b1 = test_float(float)
|
|
||||||
assert b0 == b1 # both are converted to NaN
|
|
||||||
assert b0 == from_np
|
|
||||||
|
|
||||||
def test_cat(dtype) -> bytes:
|
|
||||||
arr = pd.DataFrame(data, dtype=dtype)
|
|
||||||
if dtype is None:
|
|
||||||
arr = arr.astype("category")
|
|
||||||
Xy = xgb.DMatrix(arr, y, enable_categorical=True)
|
|
||||||
Xy.feature_types = None
|
|
||||||
return to_bytes(Xy)
|
|
||||||
|
|
||||||
b0 = test_cat(pd.CategoricalDtype())
|
|
||||||
b1 = test_cat(None)
|
|
||||||
assert b0 == b1
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user