Support categorical data for dask functional interface and DQM. (#7043)
* Support categorical data for dask functional interface and DQM. * Implement categorical data support for GPU GK-merge. * Add support for dask functional interface. * Add support for DQM. * Get newer cupy.
This commit is contained in:
@@ -321,6 +321,7 @@ class DataIter:
|
||||
def __init__(self):
|
||||
self._handle = _ProxyDMatrix()
|
||||
self.exception = None
|
||||
self.enable_categorical = False
|
||||
|
||||
@property
|
||||
def proxy(self):
|
||||
@@ -346,13 +347,12 @@ class DataIter:
|
||||
data,
|
||||
feature_names=None,
|
||||
feature_types=None,
|
||||
enable_categorical=False,
|
||||
**kwargs
|
||||
):
|
||||
from .data import dispatch_device_quantile_dmatrix_set_data
|
||||
from .data import _device_quantile_transform
|
||||
data, feature_names, feature_types = _device_quantile_transform(
|
||||
data, feature_names, feature_types, enable_categorical,
|
||||
data, feature_names, feature_types, self.enable_categorical,
|
||||
)
|
||||
dispatch_device_quantile_dmatrix_set_data(self.proxy, data)
|
||||
self.proxy.set_info(
|
||||
@@ -1106,15 +1106,10 @@ class DeviceQuantileDMatrix(DMatrix):
|
||||
data = _transform_dlpack(data)
|
||||
if _is_iter(data):
|
||||
it = data
|
||||
if enable_categorical:
|
||||
raise NotImplementedError(
|
||||
"categorical support is not enabled on data iterator."
|
||||
)
|
||||
else:
|
||||
it = SingleBatchInternalIter(
|
||||
data=data, enable_categorical=enable_categorical, **meta
|
||||
)
|
||||
it = SingleBatchInternalIter(data=data, **meta)
|
||||
|
||||
it.enable_categorical = enable_categorical
|
||||
reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(it.reset_wrapper)
|
||||
next_callback = ctypes.CFUNCTYPE(
|
||||
ctypes.c_int,
|
||||
|
||||
@@ -182,7 +182,7 @@ def concat(value: Any) -> Any: # pylint: disable=too-many-return-statements
|
||||
lazy_isinstance(value[0], 'cudf.core.series', 'Series'):
|
||||
from cudf import concat as CUDF_concat # pylint: disable=import-error
|
||||
return CUDF_concat(value, axis=0)
|
||||
if lazy_isinstance(value[0], 'cupy.core.core', 'ndarray'):
|
||||
if lazy_isinstance(value[0], 'cupy._core.core', 'ndarray'):
|
||||
import cupy
|
||||
# pylint: disable=c-extension-no-member,no-member
|
||||
d = cupy.cuda.runtime.getDevice()
|
||||
@@ -258,6 +258,7 @@ class DaskDMatrix:
|
||||
self.feature_names = feature_names
|
||||
self.feature_types = feature_types
|
||||
self.missing = missing
|
||||
self.enable_categorical = enable_categorical
|
||||
|
||||
if qid is not None and weight is not None:
|
||||
raise NotImplementedError("per-group weight is not implemented.")
|
||||
@@ -265,10 +266,6 @@ class DaskDMatrix:
|
||||
raise NotImplementedError(
|
||||
"group structure is not implemented, use qid instead."
|
||||
)
|
||||
if enable_categorical:
|
||||
raise NotImplementedError(
|
||||
"categorical support is not enabled on `DaskDMatrix`."
|
||||
)
|
||||
|
||||
if len(data.shape) != 2:
|
||||
raise ValueError(
|
||||
@@ -311,7 +308,7 @@ class DaskDMatrix:
|
||||
qid: Optional[_DaskCollection] = None,
|
||||
feature_weights: Optional[_DaskCollection] = None,
|
||||
label_lower_bound: Optional[_DaskCollection] = None,
|
||||
label_upper_bound: Optional[_DaskCollection] = None
|
||||
label_upper_bound: Optional[_DaskCollection] = None,
|
||||
) -> "DaskDMatrix":
|
||||
'''Obtain references to local data.'''
|
||||
|
||||
@@ -430,6 +427,7 @@ class DaskDMatrix:
|
||||
'feature_weights': self.feature_weights,
|
||||
'meta_names': self.meta_names,
|
||||
'missing': self.missing,
|
||||
'enable_categorical': self.enable_categorical,
|
||||
'parts': self.worker_map.get(worker_addr, None),
|
||||
'is_quantile': self.is_quantile}
|
||||
|
||||
@@ -668,6 +666,7 @@ def _create_device_quantile_dmatrix(
|
||||
missing: float,
|
||||
parts: Optional[_DataParts],
|
||||
max_bin: int,
|
||||
enable_categorical: bool,
|
||||
) -> DeviceQuantileDMatrix:
|
||||
worker = distributed.get_worker()
|
||||
if parts is None:
|
||||
@@ -680,6 +679,7 @@ def _create_device_quantile_dmatrix(
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types,
|
||||
max_bin=max_bin,
|
||||
enable_categorical=enable_categorical,
|
||||
)
|
||||
return d
|
||||
|
||||
@@ -709,6 +709,7 @@ def _create_device_quantile_dmatrix(
|
||||
feature_types=feature_types,
|
||||
nthread=worker.nthreads,
|
||||
max_bin=max_bin,
|
||||
enable_categorical=enable_categorical,
|
||||
)
|
||||
dmatrix.set_info(feature_weights=feature_weights)
|
||||
return dmatrix
|
||||
@@ -720,6 +721,7 @@ def _create_dmatrix(
|
||||
feature_weights: Optional[Any],
|
||||
meta_names: List[str],
|
||||
missing: float,
|
||||
enable_categorical: bool,
|
||||
parts: Optional[_DataParts]
|
||||
) -> DMatrix:
|
||||
'''Get data that local to worker from DaskDMatrix.
|
||||
@@ -734,9 +736,12 @@ def _create_dmatrix(
|
||||
if list_of_parts is None:
|
||||
msg = 'worker {address} has an empty DMatrix. '.format(address=worker.address)
|
||||
LOGGER.warning(msg)
|
||||
d = DMatrix(numpy.empty((0, 0)),
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types)
|
||||
d = DMatrix(
|
||||
numpy.empty((0, 0)),
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types,
|
||||
enable_categorical=enable_categorical,
|
||||
)
|
||||
return d
|
||||
|
||||
T = TypeVar('T')
|
||||
@@ -764,6 +769,7 @@ def _create_dmatrix(
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types,
|
||||
nthread=worker.nthreads,
|
||||
enable_categorical=enable_categorical,
|
||||
)
|
||||
dmatrix.set_info(
|
||||
base_margin=_base_margin,
|
||||
|
||||
Reference in New Issue
Block a user