@@ -300,6 +300,99 @@ def _cudf_array_interfaces(df):
|
||||
return interfaces_str
|
||||
|
||||
|
||||
class DataIter:
|
||||
'''The interface for user defined data iterator. Currently is only
|
||||
supported by Device DMatrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
rows : int
|
||||
Total number of rows combining all batches.
|
||||
cols : int
|
||||
Number of columns for each batch.
|
||||
'''
|
||||
def __init__(self):
|
||||
proxy_handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(proxy_handle)))
|
||||
self._handle = DeviceQuantileDMatrix(proxy_handle)
|
||||
self.exception = None
|
||||
|
||||
@property
|
||||
def proxy(self):
|
||||
'''Handler of DMatrix proxy.'''
|
||||
return self._handle
|
||||
|
||||
def reset_wrapper(self, this): # pylint: disable=unused-argument
|
||||
'''A wrapper for user defined `reset` function.'''
|
||||
self.reset()
|
||||
|
||||
def next_wrapper(self, this): # pylint: disable=unused-argument
|
||||
'''A wrapper for user defined `next` function.
|
||||
|
||||
`this` is not used in Python. ctypes can handle `self` of a Python
|
||||
member function automatically when converting a it to c function
|
||||
pointer.
|
||||
|
||||
'''
|
||||
if self.exception is not None:
|
||||
return 0
|
||||
|
||||
def data_handle(data, label=None, weight=None, base_margin=None,
|
||||
group=None,
|
||||
label_lower_bound=None, label_upper_bound=None):
|
||||
if lazy_isinstance(data, 'cudf.core.dataframe', 'DataFrame'):
|
||||
# pylint: disable=protected-access
|
||||
self.proxy._set_data_from_cuda_columnar(data)
|
||||
elif lazy_isinstance(data, 'cudf.core.series', 'Series'):
|
||||
# pylint: disable=protected-access
|
||||
self.proxy._set_data_from_cuda_columnar(data)
|
||||
elif lazy_isinstance(data, 'cupy.core.core', 'ndarray'):
|
||||
# pylint: disable=protected-access
|
||||
self.proxy._set_data_from_cuda_interface(data)
|
||||
else:
|
||||
raise TypeError(
|
||||
'Value type is not supported for data iterator:' +
|
||||
str(type(self._handle)), type(data))
|
||||
self.proxy.set_info(label=label, weight=weight,
|
||||
base_margin=base_margin,
|
||||
group=group,
|
||||
label_lower_bound=label_lower_bound,
|
||||
label_upper_bound=label_upper_bound)
|
||||
try:
|
||||
# Deffer the exception in order to return 0 and stop the iteration.
|
||||
# Exception inside a ctype callback function has no effect except
|
||||
# for printing to stderr (doesn't stop the execution).
|
||||
ret = self.next(data_handle) # pylint: disable=not-callable
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
tb = sys.exc_info()[2]
|
||||
print('Got an exception in Python')
|
||||
self.exception = e.with_traceback(tb)
|
||||
return 0
|
||||
return ret
|
||||
|
||||
def reset(self):
|
||||
'''Reset the data iterator. Prototype for user defined function.'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def next(self, input_data):
|
||||
'''Set the next batch of data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
data_handle: callable
|
||||
A function with same data fields like `data`, `label` with
|
||||
`xgboost.DMatrix`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
0 if there's no more batch, otherwise 1.
|
||||
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
"""Data Matrix used in XGBoost.
|
||||
|
||||
@@ -361,36 +454,65 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
self.handle = None
|
||||
return
|
||||
|
||||
handler = self.get_data_handler(data)
|
||||
handler = self._get_data_handler(data)
|
||||
can_handle_meta = False
|
||||
if handler is None:
|
||||
data = _convert_unknown_data(data, None)
|
||||
handler = self.get_data_handler(data)
|
||||
handler = self._get_data_handler(data)
|
||||
try:
|
||||
handler.handle_meta(label, weight, base_margin)
|
||||
can_handle_meta = True
|
||||
except NotImplementedError:
|
||||
can_handle_meta = False
|
||||
|
||||
self.handle, feature_names, feature_types = handler.handle_input(
|
||||
data, feature_names, feature_types)
|
||||
assert self.handle, 'Failed to construct a DMatrix.'
|
||||
|
||||
if label is not None:
|
||||
self.set_label(label)
|
||||
if weight is not None:
|
||||
self.set_weight(weight)
|
||||
if base_margin is not None:
|
||||
self.set_base_margin(base_margin)
|
||||
if not can_handle_meta:
|
||||
self.set_info(label, weight, base_margin)
|
||||
|
||||
self.feature_names = feature_names
|
||||
self.feature_types = feature_types
|
||||
|
||||
def get_data_handler(self, data, meta=None, meta_type=None):
|
||||
def _get_data_handler(self, data, meta=None, meta_type=None):
|
||||
'''Get data handler for this DMatrix class.'''
|
||||
from .data import get_dmatrix_data_handler
|
||||
handler = get_dmatrix_data_handler(
|
||||
data, self.missing, self.nthread, self.silent, meta, meta_type)
|
||||
return handler
|
||||
|
||||
# pylint: disable=no-self-use
|
||||
def _get_meta_handler(self, data, meta, meta_type):
|
||||
from .data import get_dmatrix_meta_handler
|
||||
handler = get_dmatrix_meta_handler(
|
||||
data, meta, meta_type)
|
||||
return handler
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, "handle") and self.handle:
|
||||
_check_call(_LIB.XGDMatrixFree(self.handle))
|
||||
self.handle = None
|
||||
|
||||
def set_info(self,
|
||||
label=None, weight=None, base_margin=None,
|
||||
group=None,
|
||||
label_lower_bound=None,
|
||||
label_upper_bound=None):
|
||||
'''Set meta info for DMatrix.'''
|
||||
if label is not None:
|
||||
self.set_label(label)
|
||||
if weight is not None:
|
||||
self.set_weight(weight)
|
||||
if base_margin is not None:
|
||||
self.set_base_margin(base_margin)
|
||||
if group is not None:
|
||||
self.set_group(group)
|
||||
if label_lower_bound is not None:
|
||||
self.set_float_info('label_lower_bound', label_lower_bound)
|
||||
if label_upper_bound is not None:
|
||||
self.set_float_info('label_upper_bound', label_upper_bound)
|
||||
|
||||
def get_float_info(self, field):
|
||||
"""Get float property from the DMatrix.
|
||||
|
||||
@@ -447,10 +569,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
if isinstance(data, np.ndarray):
|
||||
self.set_float_info_npy2d(field, data)
|
||||
return
|
||||
handler = self.get_data_handler(data, field, np.float32)
|
||||
if handler is None:
|
||||
data = _convert_unknown_data(data, field, np.float32)
|
||||
handler = self.get_data_handler(data, field, np.float32)
|
||||
handler = self._get_data_handler(data, field, np.float32)
|
||||
assert handler
|
||||
data, _, _ = handler.transform(data)
|
||||
c_data = c_array(ctypes.c_float, data)
|
||||
_check_call(_LIB.XGDMatrixSetFloatInfo(self.handle,
|
||||
@@ -470,7 +590,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
data: numpy array
|
||||
The array of data to be set
|
||||
"""
|
||||
data, _, _ = self.get_data_handler(
|
||||
data, _, _ = self._get_meta_handler(
|
||||
data, field, np.float32).transform(data)
|
||||
c_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
|
||||
_check_call(_LIB.XGDMatrixSetFloatInfo(self.handle,
|
||||
@@ -489,7 +609,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
data: numpy array
|
||||
The array of data to be set
|
||||
"""
|
||||
data, _, _ = self.get_data_handler(
|
||||
data, _, _ = self._get_data_handler(
|
||||
data, field, 'uint32').transform(data)
|
||||
_check_call(_LIB.XGDMatrixSetUIntInfo(self.handle,
|
||||
c_str(field),
|
||||
@@ -803,11 +923,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
|
||||
|
||||
class DeviceQuantileDMatrix(DMatrix):
|
||||
"""Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do not
|
||||
use this for test/validation tasks as some information may be lost in quantisation. This
|
||||
DMatrix is primarily designed to save memory in training from device memory inputs by
|
||||
avoiding intermediate storage. Implementation does not currently consider weights in
|
||||
quantisation process(unlike DMatrix). Set max_bin to control the number of bins during
|
||||
"""Device memory Data Matrix used in XGBoost for training with
|
||||
tree_method='gpu_hist'. Do not use this for test/validation tasks as some
|
||||
information may be lost in quantisation. This DMatrix is primarily designed
|
||||
to save memory in training from device memory inputs by avoiding
|
||||
intermediate storage. Set max_bin to control the number of bins during
|
||||
quantisation.
|
||||
|
||||
You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
|
||||
@@ -823,6 +943,9 @@ class DeviceQuantileDMatrix(DMatrix):
|
||||
feature_types=None,
|
||||
nthread=None, max_bin=256):
|
||||
self.max_bin = max_bin
|
||||
if isinstance(data, ctypes.c_void_p):
|
||||
self.handle = data
|
||||
return
|
||||
super().__init__(data, label=label, weight=weight,
|
||||
base_margin=base_margin,
|
||||
missing=missing,
|
||||
@@ -831,11 +954,32 @@ class DeviceQuantileDMatrix(DMatrix):
|
||||
feature_types=feature_types,
|
||||
nthread=nthread)
|
||||
|
||||
def get_data_handler(self, data, meta=None, meta_type=None):
|
||||
def _get_data_handler(self, data, meta=None, meta_type=None):
|
||||
from .data import get_device_quantile_dmatrix_data_handler
|
||||
return get_device_quantile_dmatrix_data_handler(
|
||||
data, self.max_bin, self.missing, self.nthread, self.silent)
|
||||
|
||||
def _set_data_from_cuda_interface(self, data):
|
||||
'''Set data from CUDA array interface.'''
|
||||
interface = data.__cuda_array_interface__
|
||||
interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
|
||||
_check_call(
|
||||
_LIB.XGDeviceQuantileDMatrixSetDataCudaArrayInterface(
|
||||
self.handle,
|
||||
interface_str
|
||||
)
|
||||
)
|
||||
|
||||
def _set_data_from_cuda_columnar(self, data):
|
||||
'''Set data from CUDA columnar format.1'''
|
||||
interfaces_str = _cudf_array_interfaces(data)
|
||||
_check_call(
|
||||
_LIB.XGDeviceQuantileDMatrixSetDataCudaColumnar(
|
||||
self.handle,
|
||||
interfaces_str
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class Booster(object):
|
||||
# pylint: disable=too-many-public-methods
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# pylint: disable=too-many-arguments, no-self-use
|
||||
# pylint: disable=too-many-arguments, no-self-use, too-many-instance-attributes
|
||||
'''Data dispatching for DMatrix.'''
|
||||
import ctypes
|
||||
import abc
|
||||
@@ -8,6 +8,7 @@ import warnings
|
||||
import numpy as np
|
||||
|
||||
from .core import c_array, _LIB, _check_call, c_str, _cudf_array_interfaces
|
||||
from .core import DataIter
|
||||
from .compat import lazy_isinstance, STRING_TYPES, os_fspath, os_PathLike
|
||||
|
||||
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
||||
@@ -23,6 +24,18 @@ class DataHandler(abc.ABC):
|
||||
self.meta = meta
|
||||
self.meta_type = meta_type
|
||||
|
||||
def handle_meta(self, label=None, weight=None, base_margin=None,
|
||||
group=None,
|
||||
label_lower_bound=None,
|
||||
label_upper_bound=None):
|
||||
'''Handle meta data when the DMatrix type can not defer setting meta
|
||||
data after construction. Example is `DeviceQuantileDMatrix`
|
||||
which requires weight to be presented before digesting
|
||||
data.
|
||||
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def _warn_unused_missing(self, data):
|
||||
if not (np.isnan(np.nan) or None):
|
||||
warnings.warn(
|
||||
@@ -116,6 +129,14 @@ def get_dmatrix_data_handler(data, missing, nthread, silent,
|
||||
return handler(missing, nthread, silent, meta, meta_type)
|
||||
|
||||
|
||||
def get_dmatrix_meta_handler(data, meta, meta_type):
|
||||
'''Get handler for meta instead of data.'''
|
||||
handler = __dmatrix_registry.get_handler(data)
|
||||
if handler is None:
|
||||
return None
|
||||
return handler(None, 0, True, meta, meta_type)
|
||||
|
||||
|
||||
class FileHandler(DataHandler):
|
||||
'''Handler of path like input.'''
|
||||
def handle_input(self, data, feature_names, feature_types):
|
||||
@@ -511,6 +532,43 @@ __dmatrix_registry.register_handler_opaque(
|
||||
DLPackHandler)
|
||||
|
||||
|
||||
class SingleBatchInternalIter(DataIter):
|
||||
'''An iterator for single batch data to help creating device DMatrix.
|
||||
Transforming input directly to histogram with normal single batch data API
|
||||
can not access weight for sketching. So this iterator acts as a staging
|
||||
area for meta info.
|
||||
|
||||
'''
|
||||
def __init__(self, data, label, weight, base_margin, group,
|
||||
label_lower_bound, label_upper_bound):
|
||||
self.data = data
|
||||
self.label = label
|
||||
self.weight = weight
|
||||
self.base_margin = base_margin
|
||||
self.group = group
|
||||
self.label_lower_bound = label_lower_bound
|
||||
self.label_upper_bound = label_upper_bound
|
||||
self.it = 0 # pylint: disable=invalid-name
|
||||
super().__init__()
|
||||
|
||||
def next(self, input_data):
|
||||
if self.it == 1:
|
||||
return 0
|
||||
self.it += 1
|
||||
input_data(data=self.data, label=self.label,
|
||||
weight=self.weight, base_margin=self.base_margin,
|
||||
group=self.group,
|
||||
label_lower_bound=self.label_lower_bound,
|
||||
label_upper_bound=self.label_upper_bound)
|
||||
return 1
|
||||
|
||||
def reset(self):
|
||||
self.it = 0
|
||||
|
||||
|
||||
__device_quantile_dmatrix_registry = DMatrixDataManager() # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class DeviceQuantileDMatrixDataHandler(DataHandler): # pylint: disable=abstract-method
|
||||
'''Base class of data handler for `DeviceQuantileDMatrix`.'''
|
||||
def __init__(self, max_bin, missing, nthread, silent,
|
||||
@@ -518,8 +576,53 @@ class DeviceQuantileDMatrixDataHandler(DataHandler): # pylint: disable=abstract
|
||||
self.max_bin = max_bin
|
||||
super().__init__(missing, nthread, silent, meta, meta_type)
|
||||
|
||||
def handle_meta(self, label=None, weight=None, base_margin=None,
|
||||
group=None,
|
||||
label_lower_bound=None,
|
||||
label_upper_bound=None):
|
||||
self.label = label
|
||||
self.weight = weight
|
||||
self.base_margin = base_margin
|
||||
self.group = group
|
||||
self.label_lower_bound = label_lower_bound
|
||||
self.label_upper_bound = label_upper_bound
|
||||
|
||||
__device_quantile_dmatrix_registry = DMatrixDataManager() # pylint: disable=invalid-name
|
||||
def handle_input(self, data, feature_names, feature_types):
|
||||
if not isinstance(data, DataIter):
|
||||
it = SingleBatchInternalIter(
|
||||
data, self.label, self.weight,
|
||||
self.base_margin, self.group,
|
||||
self.label_lower_bound, self.label_upper_bound)
|
||||
else:
|
||||
it = data
|
||||
reset_factory = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
||||
reset_callback = reset_factory(it.reset_wrapper)
|
||||
next_factory = ctypes.CFUNCTYPE(
|
||||
ctypes.c_int,
|
||||
ctypes.c_void_p,
|
||||
)
|
||||
next_callback = next_factory(it.next_wrapper)
|
||||
handle = ctypes.c_void_p()
|
||||
ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
|
||||
None,
|
||||
it.proxy.handle,
|
||||
reset_callback,
|
||||
next_callback,
|
||||
ctypes.c_float(self.missing),
|
||||
ctypes.c_int(self.nthread),
|
||||
ctypes.c_int(self.max_bin),
|
||||
ctypes.byref(handle)
|
||||
)
|
||||
if it.exception:
|
||||
raise it.exception
|
||||
# delay check_call to throw intermediate exception first
|
||||
_check_call(ret)
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
__device_quantile_dmatrix_registry.register_handler_opaque(
|
||||
lambda x: isinstance(x, DataIter),
|
||||
DeviceQuantileDMatrixDataHandler)
|
||||
|
||||
|
||||
def get_device_quantile_dmatrix_data_handler(
|
||||
@@ -549,19 +652,7 @@ class DeviceQuantileCudaArrayInterfaceHandler(
|
||||
data, '__array__'):
|
||||
import cupy # pylint: disable=import-error
|
||||
data = cupy.array(data, copy=False)
|
||||
|
||||
interface = data.__cuda_array_interface__
|
||||
if 'mask' in interface:
|
||||
interface['mask'] = interface['mask'].__cuda_array_interface__
|
||||
interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
|
||||
|
||||
handle = ctypes.c_void_p()
|
||||
_check_call(
|
||||
_LIB.XGDeviceQuantileDMatrixCreateFromArrayInterface(
|
||||
interface_str,
|
||||
ctypes.c_float(self.missing), ctypes.c_int(self.nthread),
|
||||
ctypes.c_int(self.max_bin), ctypes.byref(handle)))
|
||||
return handle, feature_names, feature_types
|
||||
return super().handle_input(data, feature_names, feature_types)
|
||||
|
||||
|
||||
__device_quantile_dmatrix_registry.register_handler(
|
||||
@@ -582,14 +673,7 @@ class DeviceQuantileCudaColumnarHandler(DeviceQuantileDMatrixDataHandler,
|
||||
"""Initialize Quantile Device DMatrix from columnar memory format."""
|
||||
data, feature_names, feature_types = self._maybe_cudf_dataframe(
|
||||
data, feature_names, feature_types)
|
||||
interfaces_str = _cudf_array_interfaces(data)
|
||||
handle = ctypes.c_void_p()
|
||||
_check_call(
|
||||
_LIB.XGDeviceQuantileDMatrixCreateFromArrayInterfaceColumns(
|
||||
interfaces_str,
|
||||
ctypes.c_float(self.missing), ctypes.c_int(self.nthread),
|
||||
ctypes.c_int(self.max_bin), ctypes.byref(handle)))
|
||||
return handle, feature_names, feature_types
|
||||
return super().handle_input(data, feature_names, feature_types)
|
||||
|
||||
|
||||
__device_quantile_dmatrix_registry.register_handler(
|
||||
|
||||
Reference in New Issue
Block a user