Fixes for the latest pandas. (#10266)
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
parent: 5e816e616a
commit: d81e319e78
@ -370,10 +370,8 @@ def pandas_feature_info(
|
|||||||
if feature_names is None and meta is None:
|
if feature_names is None and meta is None:
|
||||||
if isinstance(data.columns, pd.MultiIndex):
|
if isinstance(data.columns, pd.MultiIndex):
|
||||||
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
||||||
elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
|
|
||||||
feature_names = list(map(str, data.columns))
|
|
||||||
else:
|
else:
|
||||||
feature_names = data.columns.format()
|
feature_names = list(data.columns.map(str))
|
||||||
|
|
||||||
# handle feature types
|
# handle feature types
|
||||||
if feature_types is None and meta is None:
|
if feature_types is None and meta is None:
|
||||||
@ -865,6 +863,22 @@ def _is_cudf_df(data: DataType) -> bool:
|
|||||||
return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_cudf_cat_predicate() -> Callable[[Any], bool]:
|
||||||
|
try:
|
||||||
|
from cudf import CategoricalDtype
|
||||||
|
|
||||||
|
def is_categorical_dtype(dtype: Any) -> bool:
|
||||||
|
return isinstance(dtype, CategoricalDtype)
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
try:
|
||||||
|
from cudf.api.types import is_categorical_dtype # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
from cudf.utils.dtypes import is_categorical_dtype # type: ignore
|
||||||
|
|
||||||
|
return is_categorical_dtype
|
||||||
|
|
||||||
|
|
||||||
def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
|
def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
|
||||||
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list
|
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list
|
||||||
of data and a list of array interfaces. The data is list of categorical codes that
|
of data and a list of array interfaces. The data is list of categorical codes that
|
||||||
@ -872,11 +886,7 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
|
|||||||
array interface is finished.
|
array interface is finished.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
is_categorical_dtype = _get_cudf_cat_predicate()
|
||||||
from cudf.api.types import is_categorical_dtype
|
|
||||||
except ImportError:
|
|
||||||
from cudf.utils.dtypes import is_categorical_dtype
|
|
||||||
|
|
||||||
interfaces = []
|
interfaces = []
|
||||||
|
|
||||||
def append(interface: dict) -> None:
|
def append(interface: dict) -> None:
|
||||||
@ -908,12 +918,13 @@ def _transform_cudf_df(
|
|||||||
feature_types: Optional[FeatureTypes],
|
feature_types: Optional[FeatureTypes],
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
|
) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from cudf.api.types import is_bool_dtype, is_categorical_dtype
|
from cudf.api.types import is_bool_dtype
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from cudf.utils.dtypes import is_categorical_dtype
|
|
||||||
from pandas.api.types import is_bool_dtype
|
from pandas.api.types import is_bool_dtype
|
||||||
|
|
||||||
|
is_categorical_dtype = _get_cudf_cat_predicate()
|
||||||
# Work around https://github.com/dmlc/xgboost/issues/10181
|
# Work around https://github.com/dmlc/xgboost/issues/10181
|
||||||
if _is_cudf_ser(data):
|
if _is_cudf_ser(data):
|
||||||
if is_bool_dtype(data.dtype):
|
if is_bool_dtype(data.dtype):
|
||||||
@ -941,15 +952,8 @@ def _transform_cudf_df(
|
|||||||
feature_names = [data.name]
|
feature_names = [data.name]
|
||||||
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
|
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
|
||||||
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
||||||
elif (
|
|
||||||
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
|
|
||||||
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
|
|
||||||
# Unique to cuDF, no equivalence in pandas 1.3.3
|
|
||||||
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
|
|
||||||
):
|
|
||||||
feature_names = list(map(str, data.columns))
|
|
||||||
else:
|
else:
|
||||||
feature_names = data.columns.format()
|
feature_names = list(data.columns.map(str))
|
||||||
|
|
||||||
# handle feature types
|
# handle feature types
|
||||||
if feature_types is None:
|
if feature_types is None:
|
||||||
|
|||||||
@ -280,10 +280,12 @@ class TestPandas:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
|
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
|
||||||
dtrain = xgb.DMatrix(X, y)
|
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
|
||||||
|
dtrain = xgb.DMatrix(X, y)
|
||||||
booster = xgb.train({}, dtrain, num_boost_round=4)
|
booster = xgb.train({}, dtrain, num_boost_round=4)
|
||||||
predt_sparse = booster.predict(xgb.DMatrix(X))
|
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
|
||||||
predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
|
predt_sparse = booster.predict(xgb.DMatrix(X))
|
||||||
|
predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
|
||||||
np.testing.assert_allclose(predt_sparse, predt_dense)
|
np.testing.assert_allclose(predt_sparse, predt_dense)
|
||||||
|
|
||||||
def test_pandas_label(
|
def test_pandas_label(
|
||||||
@ -572,14 +574,16 @@ class TestPandas:
|
|||||||
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
|
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
|
||||||
|
|
||||||
def verify_pandas_sparse():
|
def verify_pandas_sparse():
|
||||||
dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
|
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
|
||||||
|
dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
|
||||||
booster = xgb.train({}, dtrain, num_boost_round=4)
|
booster = xgb.train({}, dtrain, num_boost_round=4)
|
||||||
predt_sparse = booster.predict(
|
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
|
||||||
xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
|
predt_sparse = booster.predict(
|
||||||
)
|
xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
|
||||||
predt_dense = booster.predict(
|
)
|
||||||
xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL)
|
predt_dense = booster.predict(
|
||||||
)
|
xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL)
|
||||||
|
)
|
||||||
np.testing.assert_allclose(predt_sparse, predt_dense)
|
np.testing.assert_allclose(predt_sparse, predt_dense)
|
||||||
|
|
||||||
tm.run_with_rabit(world_size=3, test_fn=verify_pandas_sparse)
|
tm.run_with_rabit(world_size=3, test_fn=verify_pandas_sparse)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user