Support categorical data for the dask functional interface and DeviceQuantileDMatrix (DQM). (#7043)

* Support categorical data for the dask functional interface and DeviceQuantileDMatrix (DQM).

* Implement categorical data support for GPU GK-merge.
* Add support for dask functional interface.
* Add support for DeviceQuantileDMatrix (DQM).

* Require a newer cupy (the ndarray type check now uses the `cupy._core.core` module path).
This commit is contained in:
Jiaming Yuan
2021-06-18 13:06:52 +08:00
committed by GitHub
parent 7dd29ffd47
commit 86715e4cd4
16 changed files with 364 additions and 167 deletions

View File

@@ -321,6 +321,7 @@ class DataIter:
def __init__(self):
self._handle = _ProxyDMatrix()
self.exception = None
self.enable_categorical = False
@property
def proxy(self):
@@ -346,13 +347,12 @@ class DataIter:
data,
feature_names=None,
feature_types=None,
enable_categorical=False,
**kwargs
):
from .data import dispatch_device_quantile_dmatrix_set_data
from .data import _device_quantile_transform
data, feature_names, feature_types = _device_quantile_transform(
data, feature_names, feature_types, enable_categorical,
data, feature_names, feature_types, self.enable_categorical,
)
dispatch_device_quantile_dmatrix_set_data(self.proxy, data)
self.proxy.set_info(
@@ -1106,15 +1106,10 @@ class DeviceQuantileDMatrix(DMatrix):
data = _transform_dlpack(data)
if _is_iter(data):
it = data
if enable_categorical:
raise NotImplementedError(
"categorical support is not enabled on data iterator."
)
else:
it = SingleBatchInternalIter(
data=data, enable_categorical=enable_categorical, **meta
)
it = SingleBatchInternalIter(data=data, **meta)
it.enable_categorical = enable_categorical
reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(it.reset_wrapper)
next_callback = ctypes.CFUNCTYPE(
ctypes.c_int,

View File

@@ -182,7 +182,7 @@ def concat(value: Any) -> Any: # pylint: disable=too-many-return-statements
lazy_isinstance(value[0], 'cudf.core.series', 'Series'):
from cudf import concat as CUDF_concat # pylint: disable=import-error
return CUDF_concat(value, axis=0)
if lazy_isinstance(value[0], 'cupy.core.core', 'ndarray'):
if lazy_isinstance(value[0], 'cupy._core.core', 'ndarray'):
import cupy
# pylint: disable=c-extension-no-member,no-member
d = cupy.cuda.runtime.getDevice()
@@ -258,6 +258,7 @@ class DaskDMatrix:
self.feature_names = feature_names
self.feature_types = feature_types
self.missing = missing
self.enable_categorical = enable_categorical
if qid is not None and weight is not None:
raise NotImplementedError("per-group weight is not implemented.")
@@ -265,10 +266,6 @@ class DaskDMatrix:
raise NotImplementedError(
"group structure is not implemented, use qid instead."
)
if enable_categorical:
raise NotImplementedError(
"categorical support is not enabled on `DaskDMatrix`."
)
if len(data.shape) != 2:
raise ValueError(
@@ -311,7 +308,7 @@ class DaskDMatrix:
qid: Optional[_DaskCollection] = None,
feature_weights: Optional[_DaskCollection] = None,
label_lower_bound: Optional[_DaskCollection] = None,
label_upper_bound: Optional[_DaskCollection] = None
label_upper_bound: Optional[_DaskCollection] = None,
) -> "DaskDMatrix":
'''Obtain references to local data.'''
@@ -430,6 +427,7 @@ class DaskDMatrix:
'feature_weights': self.feature_weights,
'meta_names': self.meta_names,
'missing': self.missing,
'enable_categorical': self.enable_categorical,
'parts': self.worker_map.get(worker_addr, None),
'is_quantile': self.is_quantile}
@@ -668,6 +666,7 @@ def _create_device_quantile_dmatrix(
missing: float,
parts: Optional[_DataParts],
max_bin: int,
enable_categorical: bool,
) -> DeviceQuantileDMatrix:
worker = distributed.get_worker()
if parts is None:
@@ -680,6 +679,7 @@ def _create_device_quantile_dmatrix(
feature_names=feature_names,
feature_types=feature_types,
max_bin=max_bin,
enable_categorical=enable_categorical,
)
return d
@@ -709,6 +709,7 @@ def _create_device_quantile_dmatrix(
feature_types=feature_types,
nthread=worker.nthreads,
max_bin=max_bin,
enable_categorical=enable_categorical,
)
dmatrix.set_info(feature_weights=feature_weights)
return dmatrix
@@ -720,6 +721,7 @@ def _create_dmatrix(
feature_weights: Optional[Any],
meta_names: List[str],
missing: float,
enable_categorical: bool,
parts: Optional[_DataParts]
) -> DMatrix:
'''Get data that local to worker from DaskDMatrix.
@@ -734,9 +736,12 @@ def _create_dmatrix(
if list_of_parts is None:
msg = 'worker {address} has an empty DMatrix. '.format(address=worker.address)
LOGGER.warning(msg)
d = DMatrix(numpy.empty((0, 0)),
feature_names=feature_names,
feature_types=feature_types)
d = DMatrix(
numpy.empty((0, 0)),
feature_names=feature_names,
feature_types=feature_types,
enable_categorical=enable_categorical,
)
return d
T = TypeVar('T')
@@ -764,6 +769,7 @@ def _create_dmatrix(
feature_names=feature_names,
feature_types=feature_types,
nthread=worker.nthreads,
enable_categorical=enable_categorical,
)
dmatrix.set_info(
base_margin=_base_margin,