Fix mixed types with cuDF. (#8280)
This commit is contained in:
parent
f835368bcf
commit
6925b222e0
@ -1,49 +1,72 @@
|
||||
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
|
||||
# pylint: disable=too-many-lines, too-many-locals
|
||||
"""Core XGBoost Library."""
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping
|
||||
import copy
|
||||
from typing import List, Optional, Any, Union, Dict, TypeVar
|
||||
from typing import Callable, Tuple, cast, Sequence, Type, Iterable
|
||||
import ctypes
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping
|
||||
from functools import wraps
|
||||
from inspect import signature, Parameter
|
||||
from inspect import Parameter, signature
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
|
||||
from .compat import DataFrame, py_str, PANDAS_INSTALLED
|
||||
from .libpath import find_lib_path
|
||||
from ._typing import (
|
||||
CStrPptr,
|
||||
c_bst_ulong,
|
||||
_T,
|
||||
ArrayLike,
|
||||
BoosterParam,
|
||||
CFloatPtr,
|
||||
CNumeric,
|
||||
DataType,
|
||||
CNumericPtr,
|
||||
CStrPptr,
|
||||
CStrPtr,
|
||||
CTypeT,
|
||||
ArrayLike,
|
||||
CFloatPtr,
|
||||
NumpyOrCupy,
|
||||
FeatureInfo,
|
||||
FeatureTypes,
|
||||
FeatureNames,
|
||||
_T,
|
||||
CupyT,
|
||||
BoosterParam
|
||||
DataType,
|
||||
FeatureInfo,
|
||||
FeatureNames,
|
||||
FeatureTypes,
|
||||
NumpyOrCupy,
|
||||
c_bst_ulong,
|
||||
)
|
||||
from .compat import PANDAS_INSTALLED, DataFrame, py_str
|
||||
from .libpath import find_lib_path
|
||||
|
||||
|
||||
class XGBoostError(ValueError):
|
||||
"""Error thrown by xgboost trainer."""
|
||||
|
||||
|
||||
@overload
|
||||
def from_pystr_to_cstr(data: str) -> bytes:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def from_pystr_to_cstr(data: List[str]) -> ctypes.Array:
|
||||
...
|
||||
|
||||
|
||||
def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]:
|
||||
"""Convert a Python str or list of Python str to C pointer
|
||||
|
||||
|
||||
@ -3,24 +3,33 @@
|
||||
'''Data dispatching for DMatrix.'''
|
||||
import ctypes
|
||||
import json
|
||||
import warnings
|
||||
import os
|
||||
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast
|
||||
import warnings
|
||||
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .core import c_array, _LIB, _check_call, c_str
|
||||
from .core import _cuda_array_interface
|
||||
from .core import DataIter, _ProxyDMatrix, DMatrix
|
||||
from .compat import lazy_isinstance, DataFrame
|
||||
from ._typing import (
|
||||
c_bst_ulong,
|
||||
DataType,
|
||||
FeatureTypes,
|
||||
FeatureNames,
|
||||
NumpyDType,
|
||||
CupyT,
|
||||
FloatCompatible, PandasDType
|
||||
DataType,
|
||||
FeatureNames,
|
||||
FeatureTypes,
|
||||
FloatCompatible,
|
||||
NumpyDType,
|
||||
PandasDType,
|
||||
c_bst_ulong,
|
||||
)
|
||||
from .compat import DataFrame, lazy_isinstance
|
||||
from .core import (
|
||||
_LIB,
|
||||
DataIter,
|
||||
DMatrix,
|
||||
_check_call,
|
||||
_cuda_array_interface,
|
||||
_ProxyDMatrix,
|
||||
c_array,
|
||||
c_str,
|
||||
from_pystr_to_cstr,
|
||||
)
|
||||
|
||||
DispatchedDataBackendReturnType = Tuple[
|
||||
@ -631,10 +640,10 @@ def _is_cudf_df(data: DataType) -> bool:
|
||||
|
||||
|
||||
def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
|
||||
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
|
||||
data and a list of array interfaces. The data is list of categorical codes that
|
||||
caller can safely ignore, but have to keep their reference alive until usage of array
|
||||
interface is finished.
|
||||
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list
|
||||
of data and a list of array interfaces. The data is a list of categorical codes that
|
||||
the caller can safely ignore, but must keep alive until usage of the
|
||||
array interface is finished.
|
||||
|
||||
"""
|
||||
try:
|
||||
@ -643,14 +652,18 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
|
||||
from cudf.utils.dtypes import is_categorical_dtype
|
||||
|
||||
interfaces = []
|
||||
|
||||
def append(interface: dict) -> None:
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
interfaces.append(interface)
|
||||
|
||||
if _is_cudf_ser(data):
|
||||
if is_categorical_dtype(data.dtype):
|
||||
interface = cat_codes[0].__cuda_array_interface__
|
||||
else:
|
||||
interface = data.__cuda_array_interface__
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
interfaces.append(interface)
|
||||
append(interface)
|
||||
else:
|
||||
for i, col in enumerate(data):
|
||||
if is_categorical_dtype(data[col].dtype):
|
||||
@ -658,10 +671,8 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
|
||||
interface = codes.__cuda_array_interface__
|
||||
else:
|
||||
interface = data[col].__cuda_array_interface__
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
interfaces.append(interface)
|
||||
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
|
||||
append(interface)
|
||||
interfaces_str = from_pystr_to_cstr(json.dumps(interfaces))
|
||||
return interfaces_str
|
||||
|
||||
|
||||
@ -722,9 +733,14 @@ def _transform_cudf_df(
|
||||
cat_codes.append(codes)
|
||||
else:
|
||||
for col in data:
|
||||
if is_categorical_dtype(data[col].dtype) and enable_categorical:
|
||||
dtype = data[col].dtype
|
||||
if is_categorical_dtype(dtype) and enable_categorical:
|
||||
codes = data[col].cat.codes
|
||||
cat_codes.append(codes)
|
||||
elif is_categorical_dtype(dtype):
|
||||
raise ValueError(_ENABLE_CAT_ERR)
|
||||
else:
|
||||
cat_codes.append([])
|
||||
|
||||
return data, cat_codes, feature_names, feature_types
|
||||
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
@ -176,20 +178,38 @@ Arrow specification.'''
|
||||
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_cudf_categorical(self):
|
||||
def test_cudf_categorical(self) -> None:
|
||||
import cudf
|
||||
_X, _y = tm.make_categorical(100, 30, 17, False)
|
||||
n_features = 30
|
||||
_X, _y = tm.make_categorical(100, n_features, 17, False)
|
||||
X = cudf.from_pandas(_X)
|
||||
y = cudf.from_pandas(_y)
|
||||
|
||||
Xy = xgb.DMatrix(X, y, enable_categorical=True)
|
||||
assert Xy.feature_types is not None
|
||||
assert len(Xy.feature_types) == X.shape[1]
|
||||
assert all(t == "c" for t in Xy.feature_types)
|
||||
|
||||
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
|
||||
assert Xy.feature_types is not None
|
||||
assert len(Xy.feature_types) == X.shape[1]
|
||||
assert all(t == "c" for t in Xy.feature_types)
|
||||
|
||||
# mixed dtypes
|
||||
X["1"] = X["1"].astype(np.int64)
|
||||
X["3"] = X["3"].astype(np.int64)
|
||||
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
|
||||
X, None, None, enable_categorical=True
|
||||
)
|
||||
assert X.shape[1] == n_features
|
||||
assert len(cat_codes) == X.shape[1]
|
||||
assert not cat_codes[0]
|
||||
assert not cat_codes[2]
|
||||
|
||||
interfaces_str = xgb.data._cudf_array_interfaces(df, cat_codes)
|
||||
interfaces = json.loads(interfaces_str)
|
||||
assert len(interfaces) == X.shape[1]
|
||||
|
||||
# test missing value
|
||||
X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
|
||||
X["f0"] = X["f0"].astype("category")
|
||||
@ -206,7 +226,7 @@ Arrow specification.'''
|
||||
assert Xy.num_row() == 3
|
||||
assert Xy.num_col() == 1
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(ValueError, match="enable_categorical"):
|
||||
xgb.DeviceQuantileDMatrix(X, y)
|
||||
|
||||
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user