Fixes for the latest pandas. (#10266)

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan 2024-05-12 11:15:46 +08:00 committed by GitHub
parent 5e816e616a
commit d81e319e78
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 28 deletions

View File

@ -370,10 +370,8 @@ def pandas_feature_info(
if feature_names is None and meta is None: if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex): if isinstance(data.columns, pd.MultiIndex):
feature_names = [" ".join([str(x) for x in i]) for i in data.columns] feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
feature_names = list(map(str, data.columns))
else: else:
feature_names = data.columns.format() feature_names = list(data.columns.map(str))
# handle feature types # handle feature types
if feature_types is None and meta is None: if feature_types is None and meta is None:
@ -865,6 +863,22 @@ def _is_cudf_df(data: DataType) -> bool:
return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
def _get_cudf_cat_predicate() -> Callable[[Any], bool]:
try:
from cudf import CategoricalDtype
def is_categorical_dtype(dtype: Any) -> bool:
return isinstance(dtype, CategoricalDtype)
except ImportError:
try:
from cudf.api.types import is_categorical_dtype # type: ignore
except ImportError:
from cudf.utils.dtypes import is_categorical_dtype # type: ignore
return is_categorical_dtype
def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list """Extract CuDF __cuda_array_interface__. This is special as it returns a new list
of data and a list of array interfaces. The data is list of categorical codes that of data and a list of array interfaces. The data is list of categorical codes that
@ -872,11 +886,7 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
array interface is finished. array interface is finished.
""" """
try: is_categorical_dtype = _get_cudf_cat_predicate()
from cudf.api.types import is_categorical_dtype
except ImportError:
from cudf.utils.dtypes import is_categorical_dtype
interfaces = [] interfaces = []
def append(interface: dict) -> None: def append(interface: dict) -> None:
@ -908,12 +918,13 @@ def _transform_cudf_df(
feature_types: Optional[FeatureTypes], feature_types: Optional[FeatureTypes],
enable_categorical: bool, enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]: ) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
try: try:
from cudf.api.types import is_bool_dtype, is_categorical_dtype from cudf.api.types import is_bool_dtype
except ImportError: except ImportError:
from cudf.utils.dtypes import is_categorical_dtype
from pandas.api.types import is_bool_dtype from pandas.api.types import is_bool_dtype
is_categorical_dtype = _get_cudf_cat_predicate()
# Work around https://github.com/dmlc/xgboost/issues/10181 # Work around https://github.com/dmlc/xgboost/issues/10181
if _is_cudf_ser(data): if _is_cudf_ser(data):
if is_bool_dtype(data.dtype): if is_bool_dtype(data.dtype):
@ -941,15 +952,8 @@ def _transform_cudf_df(
feature_names = [data.name] feature_names = [data.name]
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"): elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
feature_names = [" ".join([str(x) for x in i]) for i in data.columns] feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif (
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
# Unique to cuDF, no equivalence in pandas 1.3.3
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
):
feature_names = list(map(str, data.columns))
else: else:
feature_names = data.columns.format() feature_names = list(data.columns.map(str))
# handle feature types # handle feature types
if feature_types is None: if feature_types is None:

View File

@ -280,8 +280,10 @@ class TestPandas:
} }
) )
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows))) y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
dtrain = xgb.DMatrix(X, y) dtrain = xgb.DMatrix(X, y)
booster = xgb.train({}, dtrain, num_boost_round=4) booster = xgb.train({}, dtrain, num_boost_round=4)
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
predt_sparse = booster.predict(xgb.DMatrix(X)) predt_sparse = booster.predict(xgb.DMatrix(X))
predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense())) predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
np.testing.assert_allclose(predt_sparse, predt_dense) np.testing.assert_allclose(predt_sparse, predt_dense)
@ -572,8 +574,10 @@ class TestPandas:
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows))) y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
def verify_pandas_sparse(): def verify_pandas_sparse():
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL) dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
booster = xgb.train({}, dtrain, num_boost_round=4) booster = xgb.train({}, dtrain, num_boost_round=4)
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
predt_sparse = booster.predict( predt_sparse = booster.predict(
xgb.DMatrix(X, data_split_mode=DataSplitMode.COL) xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
) )