Support pandas 2.1.0. (#9557)

Authored by Jiaming Yuan on 2023-09-11 17:44:51 +08:00, committed by GitHub
parent 66a0832778
commit 9027686cac
2 changed files with 47 additions and 27 deletions
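
Context for the change: pandas 2.1.0 deprecates `pandas.api.types.is_categorical_dtype` and `pandas.api.types.is_sparse`, so the call sites below switch to new version-aware wrappers (`is_pd_cat_dtype`, `is_pd_sparse_dtype`) that use `isinstance` checks against the dtype classes on pandas >= 2.1.0 and fall back to the old predicates on older pandas. A minimal, standalone sketch of that version-gating pattern (the helper name `_is_cat_dtype` is illustrative only, not part of xgboost):

import pandas as pd


def _is_cat_dtype(dtype) -> bool:
    # pandas >= 2.1.0 deprecates is_categorical_dtype, so prefer an
    # isinstance check against the dtype class on new versions.
    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
        Version = pd.util.version.Version
        if Version(pd.__version__) >= Version("2.1.0"):
            return isinstance(dtype, pd.CategoricalDtype)
    # Older pandas: the predicate is still the supported spelling.
    from pandas.api.types import is_categorical_dtype

    return is_categorical_dtype(dtype)


print(_is_cat_dtype(pd.CategoricalDtype()))       # True
print(_is_cat_dtype(pd.Series([1, 2, 3]).dtype))  # False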


@@ -317,7 +317,6 @@ def pandas_feature_info(
 ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
     """Handle feature info for pandas dataframe."""
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse

     # handle feature names
     if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
     if feature_types is None and meta is None:
         feature_types = []
         for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                 feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
             ) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
@@ -345,18 +344,13 @@
 def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Whether dtype is a pandas nullable type."""

-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

     is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
     # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
     is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
     is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)


 def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
     )


+def is_pd_cat_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas category type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import CategoricalDtype
+
+            return isinstance(dtype, CategoricalDtype)
+
+    from pandas.api.types import is_categorical_dtype
+
+    return is_categorical_dtype(dtype)
+
+
+def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas sparse type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import SparseDtype
+
+            return isinstance(dtype, SparseDtype)
+
+    from pandas.api.types import is_sparse
+
+    return is_sparse(dtype)
+
+
 def pandas_cat_null(data: DataFrame) -> DataFrame:
     """Handle categorical dtype and nullable extension types from pandas."""
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype

     # handle category codes and nullable.
     cat_columns = []
     nul_columns = []
     # avoid an unnecessary conversion if possible
     for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
             cat_columns.append(col)
         elif is_pa_ext_categorical_dtype(dtype):
             raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
     transformed = data

     def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
             return ser.cat.codes
         assert is_pa_ext_categorical_dtype(ser.dtype)
         # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
     meta: Optional[str] = None,
     meta_type: Optional[NumpyDType] = None,
 ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse
-
     pyarrow_extension = False
     for dtype in data.dtypes:
         if not (
             (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
             or is_pa_ext_dtype(dtype)
         ):
             _invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
 ) -> None:
     """Help transform pandas series for meta data like labels"""
     data = data.values.astype("float")
-    from pandas.api.types import is_sparse

-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
         data = data.to_dense()  # type: ignore
     assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
     _meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype
-
     if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
     ):
         _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
         data = data.cat.codes
     return _from_numpy_array(
         data.values.reshape(data.shape[0], 1).astype("float"),
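
For reference, a small usage sketch of the wrappers added above. They are internal helpers in `xgboost.data` rather than public API (the test below imports them the same way), so this assumes an xgboost build that already contains this commit:

import pandas as pd
from xgboost.data import is_pd_cat_dtype, is_pd_sparse_dtype

df = pd.DataFrame(
    {
        "cat": pd.Series(["a", "b", "a"], dtype="category"),
        "sparse": pd.arrays.SparseArray([0.0, 0.0, 1.0]),
        "dense": [1.0, 2.0, 3.0],
    }
)

# Same result on pandas < 2.1 and >= 2.1, without deprecation warnings.
print([is_pd_cat_dtype(dt) for dt in df.dtypes])     # [True, False, False]
print([is_pd_sparse_dtype(dt) for dt in df.dtypes])  # [False, True, False]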


@@ -211,7 +211,7 @@ class TestPandas:
         y = np.random.randn(kRows)
         w = np.random.uniform(size=kRows).astype(np.float32)
         w_pd = pd.DataFrame(w)
-        data = xgb.DMatrix(X, y, w_pd)
+        data = xgb.DMatrix(X, y, weight=w_pd)

         assert data.num_row() == kRows
         assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:
     @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
     def test_nullable_type(self, DMatrixT) -> None:
-        from pandas.api.types import is_categorical_dtype
+        from xgboost.data import is_pd_cat_dtype

         for orig, df in pd_dtypes():
             if hasattr(df.dtypes, "__iter__"):
-                enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
+                enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
             else:
                 # series
-                enable_categorical = is_categorical_dtype(df.dtype)
+                enable_categorical = is_pd_cat_dtype(df.dtype)

             f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
             f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df
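
Note that the test change above also fixes a pre-existing bug: the old expression `any(is_categorical_dtype for dtype in df.dtypes)` never called the predicate, so the function object itself made `any()` true for every non-empty frame. A standalone illustration of the difference, using a plain pandas predicate as a stand-in:

import pandas as pd
from pandas.api.types import is_numeric_dtype  # stand-in predicate for the illustration

df = pd.DataFrame({"a": ["x", "y"], "b": ["u", "v"]})

# Buggy pattern: the generator yields the function object, which is always truthy.
print(any(is_numeric_dtype for dtype in df.dtypes))         # True, regardless of dtypes

# Fixed pattern: call the predicate on each dtype.
print(any(is_numeric_dtype(dtype) for dtype in df.dtypes))  # False, no numeric columns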