* Handle missing values in dataframe with category dtype. (#7331) * Replace -1 in pandas initializer. * Unify `IsValid` functor. * Mimic pandas data handling in cuDF glue code. * Check invalid categories. * Fix DDM sketching. * Fix pick error.
This commit is contained in:
parent
11f8b5cfcd
commit
14c56f05da
@ -386,7 +386,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
|
|||||||
raise exc # pylint: disable=raising-bad-type
|
raise exc # pylint: disable=raising-bad-type
|
||||||
|
|
||||||
def __del__(self) -> None:
|
def __del__(self) -> None:
|
||||||
assert self._temporary_data is None, self._temporary_data
|
assert self._temporary_data is None
|
||||||
assert self._exception is None
|
assert self._exception is None
|
||||||
|
|
||||||
def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
|
def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
|
||||||
@ -410,19 +410,19 @@ class DataIter: # pylint: disable=too-many-instance-attributes
|
|||||||
feature_names: Optional[List[str]] = None,
|
feature_names: Optional[List[str]] = None,
|
||||||
feature_types: Optional[List[str]] = None,
|
feature_types: Optional[List[str]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
) -> None:
|
||||||
from .data import dispatch_proxy_set_data
|
from .data import dispatch_proxy_set_data
|
||||||
from .data import _proxy_transform
|
from .data import _proxy_transform
|
||||||
|
|
||||||
transformed, feature_names, feature_types = _proxy_transform(
|
new, cat_codes, feature_names, feature_types = _proxy_transform(
|
||||||
data,
|
data,
|
||||||
feature_names,
|
feature_names,
|
||||||
feature_types,
|
feature_types,
|
||||||
self._enable_categorical,
|
self._enable_categorical,
|
||||||
)
|
)
|
||||||
# Stage the data, meta info are copied inside C++ MetaInfo.
|
# Stage the data, meta info are copied inside C++ MetaInfo.
|
||||||
self._temporary_data = transformed
|
self._temporary_data = (new, cat_codes)
|
||||||
dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
|
dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
|
||||||
self.proxy.set_info(
|
self.proxy.set_info(
|
||||||
feature_names=feature_names,
|
feature_names=feature_names,
|
||||||
feature_types=feature_types,
|
feature_types=feature_types,
|
||||||
@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
|
|||||||
self.handle = ctypes.c_void_p()
|
self.handle = ctypes.c_void_p()
|
||||||
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
|
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
|
||||||
|
|
||||||
def _set_data_from_cuda_interface(self, data):
|
def _set_data_from_cuda_interface(self, data) -> None:
|
||||||
"""Set data from CUDA array interface."""
|
"""Set data from CUDA array interface."""
|
||||||
interface = data.__cuda_array_interface__
|
interface = data.__cuda_array_interface__
|
||||||
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
|
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
|
||||||
@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
|
|||||||
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
|
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _set_data_from_cuda_columnar(self, data):
|
def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
|
||||||
"""Set data from CUDA columnar format."""
|
"""Set data from CUDA columnar format."""
|
||||||
from .data import _cudf_array_interfaces
|
from .data import _cudf_array_interfaces
|
||||||
|
|
||||||
_, interfaces_str = _cudf_array_interfaces(data)
|
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||||
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
|
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
|
||||||
|
|
||||||
def _set_data_from_array(self, data: np.ndarray):
|
def _set_data_from_array(self, data: np.ndarray):
|
||||||
@ -1986,13 +1986,6 @@ class Booster(object):
|
|||||||
preds = ctypes.POINTER(ctypes.c_float)()
|
preds = ctypes.POINTER(ctypes.c_float)()
|
||||||
|
|
||||||
# once caching is supported, we can pass id(data) as cache id.
|
# once caching is supported, we can pass id(data) as cache id.
|
||||||
try:
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
if isinstance(data, pd.DataFrame):
|
|
||||||
data = data.values
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
args = {
|
args = {
|
||||||
"type": 0,
|
"type": 0,
|
||||||
"training": False,
|
"training": False,
|
||||||
@ -2027,7 +2020,20 @@ class Booster(object):
|
|||||||
f"got {data.shape[1]}"
|
f"got {data.shape[1]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .data import _is_pandas_df, _transform_pandas_df
|
||||||
from .data import _array_interface
|
from .data import _array_interface
|
||||||
|
if (
|
||||||
|
_is_pandas_df(data)
|
||||||
|
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
||||||
|
):
|
||||||
|
ft = self.feature_types
|
||||||
|
if ft is None:
|
||||||
|
enable_categorical = False
|
||||||
|
else:
|
||||||
|
enable_categorical = any(f == "c" for f in ft)
|
||||||
|
if _is_pandas_df(data):
|
||||||
|
data, _, _ = _transform_pandas_df(data, enable_categorical)
|
||||||
|
|
||||||
if isinstance(data, np.ndarray):
|
if isinstance(data, np.ndarray):
|
||||||
from .data import _ensure_np_dtype
|
from .data import _ensure_np_dtype
|
||||||
data, _ = _ensure_np_dtype(data, data.dtype)
|
data, _ = _ensure_np_dtype(data, data.dtype)
|
||||||
@ -2080,9 +2086,11 @@ class Booster(object):
|
|||||||
)
|
)
|
||||||
return _prediction_output(shape, dims, preds, True)
|
return _prediction_output(shape, dims, preds, True)
|
||||||
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
|
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
|
||||||
from .data import _cudf_array_interfaces
|
from .data import _cudf_array_interfaces, _transform_cudf_df
|
||||||
|
data, cat_codes, _, _ = _transform_cudf_df(
|
||||||
_, interfaces_str = _cudf_array_interfaces(data)
|
data, None, None, enable_categorical
|
||||||
|
)
|
||||||
|
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||||
_check_call(
|
_check_call(
|
||||||
_LIB.XGBoosterPredictFromCudaColumnar(
|
_LIB.XGBoosterPredictFromCudaColumnar(
|
||||||
self.handle,
|
self.handle,
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
# pylint: disable=too-many-arguments, too-many-branches
|
# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
|
||||||
# pylint: disable=too-many-return-statements, import-error
|
# pylint: disable=too-many-return-statements, import-error
|
||||||
'''Data dispatching for DMatrix.'''
|
'''Data dispatching for DMatrix.'''
|
||||||
import ctypes
|
import ctypes
|
||||||
@ -12,7 +12,7 @@ import numpy as np
|
|||||||
from .core import c_array, _LIB, _check_call, c_str
|
from .core import c_array, _LIB, _check_call, c_str
|
||||||
from .core import _cuda_array_interface
|
from .core import _cuda_array_interface
|
||||||
from .core import DataIter, _ProxyDMatrix, DMatrix
|
from .core import DataIter, _ProxyDMatrix, DMatrix
|
||||||
from .compat import lazy_isinstance
|
from .compat import lazy_isinstance, DataFrame
|
||||||
|
|
||||||
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
||||||
|
|
||||||
@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _invalid_dataframe_dtype(data) -> None:
|
||||||
|
# pandas series has `dtypes` but it's just a single object
|
||||||
|
# cudf series doesn't have `dtypes`.
|
||||||
|
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
|
||||||
|
bad_fields = [
|
||||||
|
str(data.columns[i])
|
||||||
|
for i, dtype in enumerate(data.dtypes)
|
||||||
|
if dtype.name not in _pandas_dtype_mapper
|
||||||
|
]
|
||||||
|
err = " Invalid columns:" + ", ".join(bad_fields)
|
||||||
|
else:
|
||||||
|
err = ""
|
||||||
|
|
||||||
|
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
|
||||||
|
categorical type is supplied, DMatrix parameter `enable_categorical` must
|
||||||
|
be set to `True`.""" + err
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
|
||||||
def _transform_pandas_df(
|
def _transform_pandas_df(
|
||||||
data,
|
data: DataFrame,
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
feature_names: Optional[List[str]] = None,
|
feature_names: Optional[List[str]] = None,
|
||||||
feature_types: Optional[List[str]] = None,
|
feature_types: Optional[List[str]] = None,
|
||||||
meta=None,
|
meta: Optional[str] = None,
|
||||||
meta_type=None,
|
meta_type: Optional[str] = None,
|
||||||
):
|
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.api.types import is_sparse, is_categorical_dtype
|
from pandas.api.types import is_sparse, is_categorical_dtype
|
||||||
|
|
||||||
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
|
if not all(
|
||||||
(is_categorical_dtype(dtype) and enable_categorical)
|
dtype.name in _pandas_dtype_mapper
|
||||||
for dtype in data.dtypes):
|
or is_sparse(dtype)
|
||||||
bad_fields = [
|
or (is_categorical_dtype(dtype) and enable_categorical)
|
||||||
str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
|
for dtype in data.dtypes
|
||||||
if dtype.name not in _pandas_dtype_mapper
|
):
|
||||||
]
|
_invalid_dataframe_dtype(data)
|
||||||
|
|
||||||
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
|
|
||||||
categorical type is supplied, DMatrix parameter `enable_categorical` must
|
|
||||||
be set to `True`."""
|
|
||||||
raise ValueError(msg + ', '.join(bad_fields))
|
|
||||||
|
|
||||||
# handle feature names
|
# handle feature names
|
||||||
if feature_names is None and meta is None:
|
if feature_names is None and meta is None:
|
||||||
if isinstance(data.columns, pd.MultiIndex):
|
if isinstance(data.columns, pd.MultiIndex):
|
||||||
feature_names = [
|
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
||||||
' '.join([str(x) for x in i]) for i in data.columns
|
|
||||||
]
|
|
||||||
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
|
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
|
||||||
feature_names = list(map(str, data.columns))
|
feature_names = list(map(str, data.columns))
|
||||||
else:
|
else:
|
||||||
@ -263,21 +275,24 @@ def _transform_pandas_df(
|
|||||||
else:
|
else:
|
||||||
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
||||||
|
|
||||||
# handle categorical codes.
|
# handle category codes.
|
||||||
transformed = pd.DataFrame()
|
transformed = pd.DataFrame()
|
||||||
if enable_categorical:
|
if enable_categorical:
|
||||||
for i, dtype in enumerate(data.dtypes):
|
for i, dtype in enumerate(data.dtypes):
|
||||||
if is_categorical_dtype(dtype):
|
if is_categorical_dtype(dtype):
|
||||||
transformed[data.columns[i]] = data[data.columns[i]].cat.codes
|
# pandas uses -1 as default missing value for categorical data
|
||||||
|
transformed[data.columns[i]] = (
|
||||||
|
data[data.columns[i]]
|
||||||
|
.cat.codes.astype(np.float32)
|
||||||
|
.replace(-1.0, np.NaN)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
transformed[data.columns[i]] = data[data.columns[i]]
|
transformed[data.columns[i]] = data[data.columns[i]]
|
||||||
else:
|
else:
|
||||||
transformed = data
|
transformed = data
|
||||||
|
|
||||||
if meta and len(data.columns) > 1:
|
if meta and len(data.columns) > 1:
|
||||||
raise ValueError(
|
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
|
||||||
f"DataFrame for {meta} cannot have multiple columns"
|
|
||||||
)
|
|
||||||
|
|
||||||
dtype = meta_type if meta_type else np.float32
|
dtype = meta_type if meta_type else np.float32
|
||||||
arr = transformed.values
|
arr = transformed.values
|
||||||
@ -287,7 +302,7 @@ def _transform_pandas_df(
|
|||||||
|
|
||||||
|
|
||||||
def _from_pandas_df(
|
def _from_pandas_df(
|
||||||
data,
|
data: DataFrame,
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
@ -300,6 +315,7 @@ def _from_pandas_df(
|
|||||||
feature_types)
|
feature_types)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _is_pandas_series(data):
|
def _is_pandas_series(data):
|
||||||
try:
|
try:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -318,13 +334,26 @@ def _is_modin_series(data):
|
|||||||
|
|
||||||
def _from_pandas_series(
|
def _from_pandas_series(
|
||||||
data,
|
data,
|
||||||
missing,
|
missing: float,
|
||||||
nthread,
|
nthread: int,
|
||||||
|
enable_categorical: bool,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: Optional[List[str]],
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
|
from pandas.api.types import is_categorical_dtype
|
||||||
|
|
||||||
|
if (data.dtype.name not in _pandas_dtype_mapper) and not (
|
||||||
|
is_categorical_dtype(data.dtype) and enable_categorical
|
||||||
|
):
|
||||||
|
_invalid_dataframe_dtype(data)
|
||||||
|
if enable_categorical and is_categorical_dtype(data.dtype):
|
||||||
|
data = data.cat.codes
|
||||||
return _from_numpy_array(
|
return _from_numpy_array(
|
||||||
data.values.astype("float"), missing, nthread, feature_names, feature_types
|
data.values.reshape(data.shape[0], 1).astype("float"),
|
||||||
|
missing,
|
||||||
|
nthread,
|
||||||
|
feature_names,
|
||||||
|
feature_types,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -428,7 +457,7 @@ def _is_cudf_df(data):
|
|||||||
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
|
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
|
||||||
|
|
||||||
|
|
||||||
def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
|
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
|
||||||
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
|
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
|
||||||
data and a list of array interfaces. The data is list of categorical codes that
|
data and a list of array interfaces. The data is list of categorical codes that
|
||||||
caller can safely ignore, but have to keep their reference alive until usage of array
|
caller can safely ignore, but have to keep their reference alive until usage of array
|
||||||
@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from cudf.utils.dtypes import is_categorical_dtype
|
from cudf.utils.dtypes import is_categorical_dtype
|
||||||
|
|
||||||
cat_codes = []
|
|
||||||
interfaces = []
|
interfaces = []
|
||||||
if _is_cudf_ser(data):
|
if _is_cudf_ser(data):
|
||||||
interfaces.append(data.__cuda_array_interface__)
|
if is_categorical_dtype(data.dtype):
|
||||||
|
interface = cat_codes[0].__cuda_array_interface__
|
||||||
|
else:
|
||||||
|
interface = data.__cuda_array_interface__
|
||||||
|
if "mask" in interface:
|
||||||
|
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||||
|
interfaces.append(interface)
|
||||||
else:
|
else:
|
||||||
for col in data:
|
for i, col in enumerate(data):
|
||||||
if is_categorical_dtype(data[col].dtype):
|
if is_categorical_dtype(data[col].dtype):
|
||||||
codes = data[col].cat.codes
|
codes = cat_codes[i]
|
||||||
interface = codes.__cuda_array_interface__
|
interface = codes.__cuda_array_interface__
|
||||||
cat_codes.append(codes)
|
|
||||||
else:
|
else:
|
||||||
interface = data[col].__cuda_array_interface__
|
interface = data[col].__cuda_array_interface__
|
||||||
if "mask" in interface:
|
if "mask" in interface:
|
||||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||||
interfaces.append(interface)
|
interfaces.append(interface)
|
||||||
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
|
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
|
||||||
return cat_codes, interfaces_str
|
return interfaces_str
|
||||||
|
|
||||||
|
|
||||||
def _transform_cudf_df(
|
def _transform_cudf_df(
|
||||||
@ -470,25 +503,57 @@ def _transform_cudf_df(
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from cudf.utils.dtypes import is_categorical_dtype
|
from cudf.utils.dtypes import is_categorical_dtype
|
||||||
|
|
||||||
|
if _is_cudf_ser(data):
|
||||||
|
dtypes = [data.dtype]
|
||||||
|
else:
|
||||||
|
dtypes = data.dtypes
|
||||||
|
|
||||||
|
if not all(
|
||||||
|
dtype.name in _pandas_dtype_mapper
|
||||||
|
or (is_categorical_dtype(dtype) and enable_categorical)
|
||||||
|
for dtype in dtypes
|
||||||
|
):
|
||||||
|
_invalid_dataframe_dtype(data)
|
||||||
|
|
||||||
|
# handle feature names
|
||||||
if feature_names is None:
|
if feature_names is None:
|
||||||
if _is_cudf_ser(data):
|
if _is_cudf_ser(data):
|
||||||
feature_names = [data.name]
|
feature_names = [data.name]
|
||||||
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
|
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
|
||||||
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
||||||
|
elif (
|
||||||
|
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
|
||||||
|
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
|
||||||
|
# Unique to cuDF, no equivalence in pandas 1.3.3
|
||||||
|
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
|
||||||
|
):
|
||||||
|
feature_names = list(map(str, data.columns))
|
||||||
else:
|
else:
|
||||||
feature_names = data.columns.format()
|
feature_names = data.columns.format()
|
||||||
|
|
||||||
|
# handle feature types
|
||||||
if feature_types is None:
|
if feature_types is None:
|
||||||
feature_types = []
|
feature_types = []
|
||||||
if _is_cudf_ser(data):
|
|
||||||
dtypes = [data.dtype]
|
|
||||||
else:
|
|
||||||
dtypes = data.dtypes
|
|
||||||
for dtype in dtypes:
|
for dtype in dtypes:
|
||||||
if is_categorical_dtype(dtype) and enable_categorical:
|
if is_categorical_dtype(dtype) and enable_categorical:
|
||||||
feature_types.append(CAT_T)
|
feature_types.append(CAT_T)
|
||||||
else:
|
else:
|
||||||
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
||||||
return data, feature_names, feature_types
|
|
||||||
|
# handle categorical data
|
||||||
|
cat_codes = []
|
||||||
|
if _is_cudf_ser(data):
|
||||||
|
# unlike pandas, cuDF uses NA for missing data.
|
||||||
|
if is_categorical_dtype(data.dtype) and enable_categorical:
|
||||||
|
codes = data.cat.codes
|
||||||
|
cat_codes.append(codes)
|
||||||
|
else:
|
||||||
|
for col in data:
|
||||||
|
if is_categorical_dtype(data[col].dtype) and enable_categorical:
|
||||||
|
codes = data[col].cat.codes
|
||||||
|
cat_codes.append(codes)
|
||||||
|
|
||||||
|
return data, cat_codes, feature_names, feature_types
|
||||||
|
|
||||||
|
|
||||||
def _from_cudf_df(
|
def _from_cudf_df(
|
||||||
@ -499,10 +564,10 @@ def _from_cudf_df(
|
|||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
) -> Tuple[ctypes.c_void_p, Any, Any]:
|
) -> Tuple[ctypes.c_void_p, Any, Any]:
|
||||||
data, feature_names, feature_types = _transform_cudf_df(
|
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
|
||||||
data, feature_names, feature_types, enable_categorical
|
data, feature_names, feature_types, enable_categorical
|
||||||
)
|
)
|
||||||
_, interfaces_str = _cudf_array_interfaces(data)
|
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||||
handle = ctypes.c_void_p()
|
handle = ctypes.c_void_p()
|
||||||
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
|
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
|
||||||
_check_call(
|
_check_call(
|
||||||
@ -707,8 +772,9 @@ def dispatch_data_backend(
|
|||||||
return _from_pandas_df(data, enable_categorical, missing, threads,
|
return _from_pandas_df(data, enable_categorical, missing, threads,
|
||||||
feature_names, feature_types)
|
feature_names, feature_types)
|
||||||
if _is_pandas_series(data):
|
if _is_pandas_series(data):
|
||||||
return _from_pandas_series(data, missing, threads, feature_names,
|
return _from_pandas_series(
|
||||||
feature_types)
|
data, missing, threads, enable_categorical, feature_names, feature_types
|
||||||
|
)
|
||||||
if _is_cudf_df(data) or _is_cudf_ser(data):
|
if _is_cudf_df(data) or _is_cudf_ser(data):
|
||||||
return _from_cudf_df(
|
return _from_cudf_df(
|
||||||
data, missing, threads, feature_names, feature_types, enable_categorical
|
data, missing, threads, feature_names, feature_types, enable_categorical
|
||||||
@ -732,8 +798,9 @@ def dispatch_data_backend(
|
|||||||
return _from_pandas_df(data, enable_categorical, missing, threads,
|
return _from_pandas_df(data, enable_categorical, missing, threads,
|
||||||
feature_names, feature_types)
|
feature_names, feature_types)
|
||||||
if _is_modin_series(data):
|
if _is_modin_series(data):
|
||||||
return _from_pandas_series(data, missing, threads, feature_names,
|
return _from_pandas_series(
|
||||||
feature_types)
|
data, missing, threads, enable_categorical, feature_names, feature_types
|
||||||
|
)
|
||||||
if _has_array_protocol(data):
|
if _has_array_protocol(data):
|
||||||
array = np.asarray(data)
|
array = np.asarray(data)
|
||||||
return _from_numpy_array(array, missing, threads, feature_names, feature_types)
|
return _from_numpy_array(array, missing, threads, feature_names, feature_types)
|
||||||
@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
|
|||||||
_meta_from_dt(data, name, dtype, handle)
|
_meta_from_dt(data, name, dtype, handle)
|
||||||
return
|
return
|
||||||
if _is_modin_df(data):
|
if _is_modin_df(data):
|
||||||
data, _, _ = _transform_pandas_df(
|
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
|
||||||
data, False, meta=name, meta_type=dtype)
|
|
||||||
_meta_from_numpy(data, name, dtype, handle)
|
_meta_from_numpy(data, name, dtype, handle)
|
||||||
return
|
return
|
||||||
if _is_modin_series(data):
|
if _is_modin_series(data):
|
||||||
@ -917,30 +983,38 @@ def _proxy_transform(
|
|||||||
)
|
)
|
||||||
if _is_cupy_array(data):
|
if _is_cupy_array(data):
|
||||||
data = _transform_cupy_array(data)
|
data = _transform_cupy_array(data)
|
||||||
return data, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
if _is_dlpack(data):
|
if _is_dlpack(data):
|
||||||
return _transform_dlpack(data), feature_names, feature_types
|
return _transform_dlpack(data), None, feature_names, feature_types
|
||||||
if _is_numpy_array(data):
|
if _is_numpy_array(data):
|
||||||
return data, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
if _is_scipy_csr(data):
|
if _is_scipy_csr(data):
|
||||||
return data, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
if _is_pandas_df(data):
|
if _is_pandas_df(data):
|
||||||
arr, feature_names, feature_types = _transform_pandas_df(
|
arr, feature_names, feature_types = _transform_pandas_df(
|
||||||
data, enable_categorical, feature_names, feature_types
|
data, enable_categorical, feature_names, feature_types
|
||||||
)
|
)
|
||||||
return arr, feature_names, feature_types
|
return arr, None, feature_names, feature_types
|
||||||
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
|
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
|
||||||
|
|
||||||
|
|
||||||
def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
|
def dispatch_proxy_set_data(
|
||||||
|
proxy: _ProxyDMatrix,
|
||||||
|
data: Any,
|
||||||
|
cat_codes: Optional[list],
|
||||||
|
allow_host: bool,
|
||||||
|
) -> None:
|
||||||
"""Dispatch for DeviceQuantileDMatrix."""
|
"""Dispatch for DeviceQuantileDMatrix."""
|
||||||
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
||||||
_check_data_shape(data)
|
_check_data_shape(data)
|
||||||
|
|
||||||
if _is_cudf_df(data):
|
if _is_cudf_df(data):
|
||||||
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
|
# pylint: disable=W0212
|
||||||
|
proxy._set_data_from_cuda_columnar(data, cat_codes)
|
||||||
return
|
return
|
||||||
if _is_cudf_ser(data):
|
if _is_cudf_ser(data):
|
||||||
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
|
# pylint: disable=W0212
|
||||||
|
proxy._set_data_from_cuda_columnar(data, cat_codes)
|
||||||
return
|
return
|
||||||
if _is_cupy_array(data):
|
if _is_cupy_array(data):
|
||||||
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
|
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2020 by XGBoost Contributors
|
* Copyright 2020-2021 by XGBoost Contributors
|
||||||
* \file categorical.h
|
* \file categorical.h
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
|
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
|
||||||
@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
|
|||||||
return !s_cats.Check(cat);
|
return !s_cats.Check(cat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void CheckCat(bst_cat_t cat) {
|
||||||
|
CHECK_GE(cat, 0) << "Invalid categorical value detected. Categorical value "
|
||||||
|
"should be non-negative.";
|
||||||
|
}
|
||||||
|
|
||||||
struct IsCatOp {
|
struct IsCatOp {
|
||||||
XGBOOST_DEVICE bool operator()(FeatureType ft) {
|
XGBOOST_DEVICE bool operator()(FeatureType ft) {
|
||||||
return ft == FeatureType::kCategorical;
|
return ft == FeatureType::kCategorical;
|
||||||
|
|||||||
@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
|
|||||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||||
dh::device_vector<Entry> *p_sorted_entries,
|
dh::device_vector<Entry> *p_sorted_entries,
|
||||||
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
|
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
|
||||||
|
info.feature_types.SetDevice(device);
|
||||||
auto d_feature_types = info.feature_types.ConstDeviceSpan();
|
auto d_feature_types = info.feature_types.ConstDeviceSpan();
|
||||||
CHECK(!d_feature_types.empty());
|
CHECK(!d_feature_types.empty());
|
||||||
auto &column_sizes_scan = *p_column_sizes_scan;
|
auto &column_sizes_scan = *p_column_sizes_scan;
|
||||||
|
|||||||
@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
|
|||||||
|
|
||||||
void SortByWeight(dh::device_vector<float>* weights,
|
void SortByWeight(dh::device_vector<float>* weights,
|
||||||
dh::device_vector<Entry>* sorted_entries);
|
dh::device_vector<Entry>* sorted_entries);
|
||||||
|
|
||||||
|
void RemoveDuplicatedCategories(
|
||||||
|
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||||
|
dh::device_vector<Entry> *p_sorted_entries,
|
||||||
|
dh::caching_device_vector<size_t> *p_column_sizes_scan);
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
|
|
||||||
// Compute sketch on DMatrix.
|
// Compute sketch on DMatrix.
|
||||||
@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
|
|||||||
size_t sketch_batch_num_elements = 0);
|
size_t sketch_batch_num_elements = 0);
|
||||||
|
|
||||||
template <typename AdapterBatch>
|
template <typename AdapterBatch>
|
||||||
void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||||
size_t begin, size_t end, float missing,
|
int device, size_t columns, size_t begin, size_t end,
|
||||||
SketchContainer* sketch_container, int num_cuts) {
|
float missing, SketchContainer *sketch_container,
|
||||||
|
int num_cuts) {
|
||||||
// Copy current subset of valid elements into temporary storage and sort
|
// Copy current subset of valid elements into temporary storage and sort
|
||||||
dh::device_vector<Entry> sorted_entries;
|
dh::device_vector<Entry> sorted_entries;
|
||||||
dh::caching_device_vector<size_t> column_sizes_scan;
|
dh::caching_device_vector<size_t> column_sizes_scan;
|
||||||
@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
|||||||
thrust::make_counting_iterator(0llu),
|
thrust::make_counting_iterator(0llu),
|
||||||
[=] __device__(size_t idx) { return batch.GetElement(idx); });
|
[=] __device__(size_t idx) { return batch.GetElement(idx); });
|
||||||
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
|
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
|
||||||
|
cuts_ptr.SetDevice(device);
|
||||||
detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
|
detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
|
||||||
columns, num_cuts, device,
|
columns, num_cuts, device,
|
||||||
&cuts_ptr,
|
&cuts_ptr,
|
||||||
@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
|||||||
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
|
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
|
||||||
sorted_entries.end(), detail::EntryCompareOp());
|
sorted_entries.end(), detail::EntryCompareOp());
|
||||||
|
|
||||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
if (sketch_container->HasCategorical()) {
|
||||||
|
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||||
|
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||||
|
&sorted_entries, &column_sizes_scan);
|
||||||
|
}
|
||||||
|
|
||||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||||
|
auto const &h_cuts_ptr = cuts_ptr.HostVector();
|
||||||
// Extract the cuts from all columns concurrently
|
// Extract the cuts from all columns concurrently
|
||||||
sketch_container->Push(dh::ToSpan(sorted_entries),
|
sketch_container->Push(dh::ToSpan(sorted_entries),
|
||||||
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
|
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
|
||||||
@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
|||||||
|
|
||||||
detail::SortByWeight(&temp_weights, &sorted_entries);
|
detail::SortByWeight(&temp_weights, &sorted_entries);
|
||||||
|
|
||||||
|
if (sketch_container->HasCategorical()) {
|
||||||
|
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||||
|
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||||
|
&sorted_entries, &column_sizes_scan);
|
||||||
|
}
|
||||||
|
|
||||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||||
|
|
||||||
@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
|||||||
device, num_cuts_per_feature, false);
|
device, num_cuts_per_feature, false);
|
||||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||||
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
|
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
|
||||||
ProcessSlidingWindow(batch, device, num_cols,
|
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
|
||||||
begin, end, missing, sketch_container, num_cuts_per_feature);
|
sketch_container, num_cuts_per_feature);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -21,6 +21,7 @@
|
|||||||
|
|
||||||
#include "array_interface.h"
|
#include "array_interface.h"
|
||||||
#include "../c_api/c_api_error.h"
|
#include "../c_api/c_api_error.h"
|
||||||
|
#include "../common/math.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
@ -80,6 +81,24 @@ struct COOTuple {
|
|||||||
float value{0};
|
float value{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct IsValidFunctor {
|
||||||
|
float missing;
|
||||||
|
|
||||||
|
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
|
||||||
|
|
||||||
|
XGBOOST_DEVICE bool operator()(float value) const {
|
||||||
|
return !(common::CheckNAN(value) || value == missing);
|
||||||
|
}
|
||||||
|
|
||||||
|
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
|
||||||
|
return !(common::CheckNAN(e.value) || e.value == missing);
|
||||||
|
}
|
||||||
|
|
||||||
|
XGBOOST_DEVICE bool operator()(const Entry& e) const {
|
||||||
|
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -987,18 +987,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
|||||||
|
|
||||||
// Second pass over batch, placing elements in correct position
|
// Second pass over batch, placing elements in correct position
|
||||||
|
|
||||||
|
auto is_valid = data::IsValidFunctor{missing};
|
||||||
#pragma omp parallel num_threads(nthread)
|
#pragma omp parallel num_threads(nthread)
|
||||||
{
|
{
|
||||||
exec.Run([&]() {
|
exec.Run([&]() {
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
size_t begin = tid*thread_size;
|
size_t begin = tid * thread_size;
|
||||||
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
|
||||||
for (size_t i = begin; i < end; ++i) {
|
for (size_t i = begin; i < end; ++i) {
|
||||||
auto line = batch.GetLine(i);
|
auto line = batch.GetLine(i);
|
||||||
for (auto j = 0ull; j < line.Size(); j++) {
|
for (auto j = 0ull; j < line.Size(); j++) {
|
||||||
auto element = line.GetElement(j);
|
auto element = line.GetElement(j);
|
||||||
const size_t key = (element.row_idx - base_rowid);
|
const size_t key = (element.row_idx - base_rowid);
|
||||||
if (!common::CheckNAN(element.value) && element.value != missing) {
|
if (is_valid(element)) {
|
||||||
builder.Push(key, Entry(element.column_idx, element.value), tid);
|
builder.Push(key, Entry(element.column_idx, element.value), tid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,29 +15,6 @@
|
|||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
|
|
||||||
float missing;
|
|
||||||
|
|
||||||
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
|
|
||||||
|
|
||||||
__device__ bool operator()(float value) const {
|
|
||||||
return !(common::CheckNAN(value) || value == missing);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ bool operator()(const data::COOTuple& e) const {
|
|
||||||
if (common::CheckNAN(e.value) || e.value == missing) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
__device__ bool operator()(const Entry& e) const {
|
|
||||||
if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class CudfAdapterBatch : public detail::NoMetaInfo {
|
class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||||
friend class CudfAdapter;
|
friend class CudfAdapter;
|
||||||
|
|
||||||
|
|||||||
@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
|
|||||||
|
|
||||||
if (batches == 1) {
|
if (batches == 1) {
|
||||||
this->info_ = std::move(proxy->Info());
|
this->info_ = std::move(proxy->Info());
|
||||||
|
this->info_.num_nonzero_ = nnz;
|
||||||
CHECK_EQ(proxy->Info().labels_.Size(), 0);
|
CHECK_EQ(proxy->Info().labels_.Size(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -585,6 +585,7 @@ struct GPUHistMakerDevice {
|
|||||||
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
|
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
|
||||||
<< "Categorical feature value too large.";
|
<< "Categorical feature value too large.";
|
||||||
auto cat = common::AsCat(candidate.split.fvalue);
|
auto cat = common::AsCat(candidate.split.fvalue);
|
||||||
|
common::CheckCat(cat);
|
||||||
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
|
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
|
||||||
LBitField32 cats_bits(split_cats);
|
LBitField32 cats_bits(split_cats);
|
||||||
cats_bits.Set(cat);
|
cats_bits.Set(cat);
|
||||||
|
|||||||
@ -392,6 +392,52 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
|
|||||||
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
|
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
|
||||||
|
int32_t num_bins, bool weighted) {
|
||||||
|
auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
|
||||||
|
thrust::device_vector<float> x(h_x);
|
||||||
|
auto adapter = AdapterFromData(x, n, 1);
|
||||||
|
MetaInfo info;
|
||||||
|
info.num_row_ = n;
|
||||||
|
info.num_col_ = 1;
|
||||||
|
info.feature_types.HostVector().push_back(FeatureType::kCategorical);
|
||||||
|
|
||||||
|
if (weighted) {
|
||||||
|
std::vector<float> weights(n, 0);
|
||||||
|
SimpleLCG lcg;
|
||||||
|
SimpleRealUniformDistribution<float> dist(0, 1);
|
||||||
|
for (auto& v : weights) {
|
||||||
|
v = dist(&lcg);
|
||||||
|
}
|
||||||
|
info.weights_.HostVector() = weights;
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_EQ(info.feature_types.Size(), 1);
|
||||||
|
SketchContainer container(info.feature_types, num_bins, 1, n, 0);
|
||||||
|
AdapterDeviceSketch(adapter.Value(), num_bins, info,
|
||||||
|
std::numeric_limits<float>::quiet_NaN(), &container);
|
||||||
|
HistogramCuts cuts;
|
||||||
|
container.MakeCuts(&cuts);
|
||||||
|
|
||||||
|
thrust::sort(x.begin(), x.end());
|
||||||
|
auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
|
||||||
|
ASSERT_NE(n_uniques, x.size());
|
||||||
|
ASSERT_EQ(cuts.TotalBins(), n_uniques);
|
||||||
|
ASSERT_EQ(n_uniques, num_categories);
|
||||||
|
|
||||||
|
auto& values = cuts.cut_values_.HostVector();
|
||||||
|
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
|
||||||
|
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
|
||||||
|
ASSERT_TRUE(is_unique);
|
||||||
|
|
||||||
|
x.resize(n_uniques);
|
||||||
|
h_x.resize(n_uniques);
|
||||||
|
thrust::copy(x.begin(), x.end(), h_x.begin());
|
||||||
|
for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
|
||||||
|
ASSERT_EQ(h_x[i], values[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST(HistUtil, AdapterDeviceSketchCategorical) {
|
TEST(HistUtil, AdapterDeviceSketchCategorical) {
|
||||||
int categorical_sizes[] = {2, 6, 8, 12};
|
int categorical_sizes[] = {2, 6, 8, 12};
|
||||||
int num_bins = 256;
|
int num_bins = 256;
|
||||||
@ -404,6 +450,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
|
|||||||
auto adapter = AdapterFromData(x_device, n, 1);
|
auto adapter = AdapterFromData(x_device, n, 1);
|
||||||
ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
|
ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
|
||||||
adapter.NumRows(), dmat.get());
|
adapter.NumRows(), dmat.get());
|
||||||
|
TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
|
||||||
|
TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -186,6 +186,37 @@ Arrow specification.'''
|
|||||||
assert len(Xy.feature_types) == X.shape[1]
|
assert len(Xy.feature_types) == X.shape[1]
|
||||||
assert all(t == "c" for t in Xy.feature_types)
|
assert all(t == "c" for t in Xy.feature_types)
|
||||||
|
|
||||||
|
# test missing value
|
||||||
|
X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
|
||||||
|
X["f0"] = X["f0"].astype("category")
|
||||||
|
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
|
||||||
|
X, None, None, enable_categorical=True
|
||||||
|
)
|
||||||
|
for col in cat_codes:
|
||||||
|
assert col.has_nulls
|
||||||
|
|
||||||
|
y = [0, 1, 2]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
xgb.DMatrix(X, y)
|
||||||
|
Xy = xgb.DMatrix(X, y, enable_categorical=True)
|
||||||
|
assert Xy.num_row() == 3
|
||||||
|
assert Xy.num_col() == 1
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
xgb.DeviceQuantileDMatrix(X, y)
|
||||||
|
|
||||||
|
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
|
||||||
|
assert Xy.num_row() == 3
|
||||||
|
assert Xy.num_col() == 1
|
||||||
|
|
||||||
|
X = X["f0"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
xgb.DMatrix(X, y)
|
||||||
|
|
||||||
|
Xy = xgb.DMatrix(X, y, enable_categorical=True)
|
||||||
|
assert Xy.num_row() == 3
|
||||||
|
assert Xy.num_col() == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(**tm.no_cudf())
|
@pytest.mark.skipif(**tm.no_cudf())
|
||||||
@pytest.mark.skipif(**tm.no_cupy())
|
@pytest.mark.skipif(**tm.no_cupy())
|
||||||
|
|||||||
@ -138,9 +138,22 @@ class TestPandas:
|
|||||||
X, enable_categorical=True
|
X, enable_categorical=True
|
||||||
)
|
)
|
||||||
|
|
||||||
assert np.issubdtype(transformed[:, 0].dtype, np.integer)
|
|
||||||
assert transformed[:, 0].min() == 0
|
assert transformed[:, 0].min() == 0
|
||||||
|
|
||||||
|
# test missing value
|
||||||
|
X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
|
||||||
|
X["f0"] = X["f0"].astype("category")
|
||||||
|
arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
|
||||||
|
assert not np.any(arr == -1.0)
|
||||||
|
|
||||||
|
X = X["f0"]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
xgb.DMatrix(X, y)
|
||||||
|
|
||||||
|
Xy = xgb.DMatrix(X, y, enable_categorical=True)
|
||||||
|
assert Xy.num_row() == 3
|
||||||
|
assert Xy.num_col() == 1
|
||||||
|
|
||||||
def test_pandas_sparse(self):
|
def test_pandas_sparse(self):
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
rows = 100
|
rows = 100
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user