Enhance inplace prediction. (#6653)
* Accept array interface for csr and array.
* Accept an optional proxy dmatrix for metainfo. This constructs an explicit `_ProxyDMatrix` type in Python.
* Remove unused doc.
* Add strict output.
This commit is contained in:
@@ -58,21 +58,23 @@ CallbackEnv = collections.namedtuple(
|
||||
"evaluation_result_list"])
|
||||
|
||||
|
||||
def from_pystr_to_cstr(data):
|
||||
"""Convert a list of Python str to C pointer
|
||||
def from_pystr_to_cstr(data: Union[str, List[str]]):
|
||||
"""Convert a Python str or list of Python str to C pointer
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : list
|
||||
list of str
|
||||
data
|
||||
str or list of str
|
||||
"""
|
||||
|
||||
if not isinstance(data, list):
|
||||
raise NotImplementedError
|
||||
pointers = (ctypes.c_char_p * len(data))()
|
||||
data = [bytes(d, 'utf-8') for d in data]
|
||||
pointers[:] = data
|
||||
return pointers
|
||||
if isinstance(data, str):
|
||||
return bytes(data, "utf-8")
|
||||
if isinstance(data, list):
|
||||
pointers = (ctypes.c_char_p * len(data))()
|
||||
data = [bytes(d, 'utf-8') for d in data]
|
||||
pointers[:] = data
|
||||
return pointers
|
||||
raise TypeError()
|
||||
|
||||
|
||||
def from_cstr_to_pystr(data, length):
|
||||
@@ -190,21 +192,40 @@ def _check_call(ret):
|
||||
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
|
||||
|
||||
|
||||
def ctypes2numpy(cptr, length, dtype) -> np.ndarray:
|
||||
"""Convert a ctypes pointer array to a numpy array."""
|
||||
NUMPY_TO_CTYPES_MAPPING = {
|
||||
def _numpy2ctypes_type(dtype):
|
||||
_NUMPY_TO_CTYPES_MAPPING = {
|
||||
np.float32: ctypes.c_float,
|
||||
np.float64: ctypes.c_double,
|
||||
np.uint32: ctypes.c_uint,
|
||||
np.uint64: ctypes.c_uint64,
|
||||
np.int32: ctypes.c_int32,
|
||||
np.int64: ctypes.c_int64,
|
||||
}
|
||||
if dtype not in NUMPY_TO_CTYPES_MAPPING:
|
||||
raise RuntimeError('Supported types: {}'.format(
|
||||
NUMPY_TO_CTYPES_MAPPING.keys()))
|
||||
ctype = NUMPY_TO_CTYPES_MAPPING[dtype]
|
||||
if np.intc is not np.int32: # Windows
|
||||
_NUMPY_TO_CTYPES_MAPPING[np.intc] = _NUMPY_TO_CTYPES_MAPPING[np.int32]
|
||||
if dtype not in _NUMPY_TO_CTYPES_MAPPING.keys():
|
||||
raise TypeError(
|
||||
f"Supported types: {_NUMPY_TO_CTYPES_MAPPING.keys()}, got: {dtype}"
|
||||
)
|
||||
return _NUMPY_TO_CTYPES_MAPPING[dtype]
|
||||
|
||||
|
||||
def _array_interface(data: np.ndarray) -> bytes:
    """Serialize the ``__array_interface__`` of ``data`` as UTF-8 encoded JSON.

    Parameters
    ----------
    data
        Object exposing ``__array_interface__``.  When the interface carries a
        ``"mask"`` entry, that entry must itself expose ``__array_interface__``;
        it is replaced by its own interface mapping in the output.

    Returns
    -------
    bytes
        The JSON document (indented by 2 spaces) encoded as UTF-8.
    """
    # Work on a shallow copy: an array-like may hand out the very mapping it
    # stores, and overwriting "mask" in place would corrupt the caller's object.
    interface = dict(data.__array_interface__)
    if "mask" in interface:
        interface["mask"] = interface["mask"].__array_interface__
    return bytes(json.dumps(interface, indent=2), "utf-8")
|
||||
|
||||
|
||||
def ctypes2numpy(cptr, length, dtype):
|
||||
"""Convert a ctypes pointer array to a numpy array."""
|
||||
ctype = _numpy2ctypes_type(dtype)
|
||||
if not isinstance(cptr, ctypes.POINTER(ctype)):
|
||||
raise RuntimeError('expected {} pointer'.format(ctype))
|
||||
raise RuntimeError("expected {} pointer".format(ctype))
|
||||
res = np.zeros(length, dtype=dtype)
|
||||
if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):
|
||||
raise RuntimeError('memmove failed')
|
||||
raise RuntimeError("memmove failed")
|
||||
return res
|
||||
|
||||
|
||||
@@ -214,25 +235,21 @@ def ctypes2cupy(cptr, length, dtype):
|
||||
import cupy
|
||||
from cupy.cuda.memory import MemoryPointer
|
||||
from cupy.cuda.memory import UnownedMemory
|
||||
CUPY_TO_CTYPES_MAPPING = {
|
||||
cupy.float32: ctypes.c_float,
|
||||
cupy.uint32: ctypes.c_uint
|
||||
}
|
||||
|
||||
CUPY_TO_CTYPES_MAPPING = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint}
|
||||
if dtype not in CUPY_TO_CTYPES_MAPPING.keys():
|
||||
raise RuntimeError('Supported types: {}'.format(
|
||||
CUPY_TO_CTYPES_MAPPING.keys()
|
||||
))
|
||||
raise RuntimeError("Supported types: {}".format(CUPY_TO_CTYPES_MAPPING.keys()))
|
||||
addr = ctypes.cast(cptr, ctypes.c_void_p).value
|
||||
# pylint: disable=c-extension-no-member,no-member
|
||||
device = cupy.cuda.runtime.pointerGetAttributes(addr).device
|
||||
# The owner field is just used to keep the memory alive with ref count. As
|
||||
# unowned's life time is scoped within this function we don't need that.
|
||||
unownd = UnownedMemory(
|
||||
addr, length.value * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]),
|
||||
owner=None)
|
||||
addr, length * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]), owner=None
|
||||
)
|
||||
memptr = MemoryPointer(unownd, 0)
|
||||
# pylint: disable=unexpected-keyword-arg
|
||||
mem = cupy.ndarray((length.value, ), dtype=dtype, memptr=memptr)
|
||||
mem = cupy.ndarray((length,), dtype=dtype, memptr=memptr)
|
||||
assert mem.device.id == device
|
||||
arr = cupy.array(mem, copy=True)
|
||||
return arr
|
||||
@@ -256,28 +273,29 @@ def c_str(string):
|
||||
|
||||
def c_array(ctype, values):
|
||||
"""Convert a python string to c array."""
|
||||
if (isinstance(values, np.ndarray)
|
||||
and values.dtype.itemsize == ctypes.sizeof(ctype)):
|
||||
if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
|
||||
return (ctype * len(values)).from_buffer_copy(values)
|
||||
return (ctype * len(values))(*values)
|
||||
|
||||
|
||||
def _prediction_output(shape, dims, predts, is_cuda):
    """Assemble the prediction array from the raw output buffers of a predict call.

    Parameters
    ----------
    shape
        Pointer to the output extents; ``dims.value`` entries are read as uint64.
    dims
        Object whose ``value`` attribute is the number of entries in ``shape``.
    predts
        Pointer to the float32 prediction values.
    is_cuda
        When true the values are loaded with ``ctypes2cupy``, otherwise with
        ``ctypes2numpy``.

    Returns
    -------
    The loaded values reshaped to the extents read from ``shape``.
    """
    out_shape: np.ndarray = ctypes2numpy(shape, dims.value, np.uint64)
    # Total element count is the product of all output extents.
    n_values = int(np.prod(out_shape))
    loader = ctypes2cupy if is_cuda else ctypes2numpy
    return loader(predts, n_values, np.float32).reshape(out_shape)
|
||||
|
||||
|
||||
class DataIter:
|
||||
'''The interface for user defined data iterator. Currently is only
|
||||
supported by Device DMatrix.
|
||||
'''The interface for user defined data iterator. Currently is only supported by Device
|
||||
DMatrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
rows : int
|
||||
Total number of rows combining all batches.
|
||||
cols : int
|
||||
Number of columns for each batch.
|
||||
'''
|
||||
def __init__(self):
|
||||
proxy_handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(proxy_handle)))
|
||||
self._handle = DeviceQuantileDMatrix(proxy_handle)
|
||||
self._handle = _ProxyDMatrix()
|
||||
self.exception = None
|
||||
|
||||
@property
|
||||
@@ -300,12 +318,7 @@ class DataIter:
|
||||
if self.exception is not None:
|
||||
return 0
|
||||
|
||||
def data_handle(data, label=None, weight=None, base_margin=None,
|
||||
group=None,
|
||||
qid=None,
|
||||
label_lower_bound=None, label_upper_bound=None,
|
||||
feature_names=None, feature_types=None,
|
||||
feature_weights=None):
|
||||
def data_handle(data, feature_names=None, feature_types=None, **kwargs):
|
||||
from .data import dispatch_device_quantile_dmatrix_set_data
|
||||
from .data import _device_quantile_transform
|
||||
data, feature_names, feature_types = _device_quantile_transform(
|
||||
@@ -313,16 +326,9 @@ class DataIter:
|
||||
)
|
||||
dispatch_device_quantile_dmatrix_set_data(self.proxy, data)
|
||||
self.proxy.set_info(
|
||||
label=label,
|
||||
weight=weight,
|
||||
base_margin=base_margin,
|
||||
group=group,
|
||||
qid=qid,
|
||||
label_lower_bound=label_lower_bound,
|
||||
label_upper_bound=label_upper_bound,
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types,
|
||||
feature_weights=feature_weights
|
||||
**kwargs,
|
||||
)
|
||||
try:
|
||||
# Defer the exception in order to return 0 and stop the iteration.
|
||||
@@ -558,7 +564,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
feature_types=None,
|
||||
feature_weights=None
|
||||
) -> None:
|
||||
"""Set meta info for DMatrix. See doc string for DMatrix constructor."""
|
||||
"""Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`."""
|
||||
from .data import dispatch_meta_backend
|
||||
|
||||
if label is not None:
|
||||
@@ -959,76 +965,14 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
c_bst_ulong(0)))
|
||||
|
||||
|
||||
class DeviceQuantileDMatrix(DMatrix):
|
||||
"""Device memory Data Matrix used in XGBoost for training with
|
||||
tree_method='gpu_hist'. Do not use this for test/validation tasks as some
|
||||
information may be lost in quantisation. This DMatrix is primarily designed
|
||||
to save memory in training from device memory inputs by avoiding
|
||||
intermediate storage. Set max_bin to control the number of bins during
|
||||
quantisation. See doc string in `DMatrix` for documents on meta info.
|
||||
class _ProxyDMatrix(DMatrix):
|
||||
"""A placeholder class when DMatrix cannot be constructed (DeviceQuantileDMatrix,
|
||||
inplace_predict).
|
||||
|
||||
You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
|
||||
|
||||
.. versionadded:: 1.1.0
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__( # pylint: disable=super-init-not-called
|
||||
self,
|
||||
data,
|
||||
label=None,
|
||||
*,
|
||||
weight=None,
|
||||
base_margin=None,
|
||||
missing=None,
|
||||
silent=False,
|
||||
feature_names=None,
|
||||
feature_types=None,
|
||||
nthread: Optional[int] = None,
|
||||
max_bin: int = 256,
|
||||
group=None,
|
||||
qid=None,
|
||||
label_lower_bound=None,
|
||||
label_upper_bound=None,
|
||||
feature_weights=None,
|
||||
enable_categorical: bool = False,
|
||||
):
|
||||
self.max_bin = max_bin
|
||||
self.missing = missing if missing is not None else np.nan
|
||||
self.nthread = nthread if nthread is not None else 1
|
||||
self._silent = silent # unused, kept for compatibility
|
||||
|
||||
if isinstance(data, ctypes.c_void_p):
|
||||
self.handle = data
|
||||
return
|
||||
from .data import init_device_quantile_dmatrix
|
||||
handle, feature_names, feature_types = init_device_quantile_dmatrix(
|
||||
data,
|
||||
label=label, weight=weight,
|
||||
base_margin=base_margin,
|
||||
group=group,
|
||||
qid=qid,
|
||||
missing=self.missing,
|
||||
label_lower_bound=label_lower_bound,
|
||||
label_upper_bound=label_upper_bound,
|
||||
feature_weights=feature_weights,
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types,
|
||||
threads=self.nthread,
|
||||
max_bin=self.max_bin,
|
||||
)
|
||||
if enable_categorical:
|
||||
raise NotImplementedError(
|
||||
'categorical support is not enabled on DeviceQuantileDMatrix.'
|
||||
)
|
||||
self.handle = handle
|
||||
if qid is not None and group is not None:
|
||||
raise ValueError(
|
||||
'Only one of the eval_qid or eval_group for each evaluation '
|
||||
'dataset should be provided.'
|
||||
)
|
||||
|
||||
self.feature_names = feature_names
|
||||
self.feature_types = feature_types
|
||||
def __init__(self): # pylint: disable=super-init-not-called
|
||||
self.handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
|
||||
|
||||
def _set_data_from_cuda_interface(self, data):
|
||||
'''Set data from CUDA array interface.'''
|
||||
@@ -1053,6 +997,116 @@ class DeviceQuantileDMatrix(DMatrix):
|
||||
)
|
||||
|
||||
|
||||
class DeviceQuantileDMatrix(DMatrix):
    """Device memory Data Matrix used in XGBoost for training with
    tree_method='gpu_hist'.

    Do not use this for test/validation tasks as some information may be lost in
    quantisation.  This DMatrix is primarily designed to save memory in training
    from device memory inputs by avoiding intermediate storage.  Set max_bin to
    control the number of bins during quantisation.  See doc string in
    :py:obj:`xgboost.DMatrix` for documents on meta info.

    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.

    .. versionadded:: 1.1.0

    """

    @_deprecate_positional_args
    def __init__(  # pylint: disable=super-init-not-called
        self,
        data,
        label=None,
        *,
        weight=None,
        base_margin=None,
        missing=None,
        silent=False,
        feature_names=None,
        feature_types=None,
        nthread: Optional[int] = None,
        max_bin: int = 256,
        group=None,
        qid=None,
        label_lower_bound=None,
        label_upper_bound=None,
        feature_weights=None,
        enable_categorical: bool = False,
    ):
        """Store the construction parameters and build the matrix from ``data``.

        When ``data`` is already a ``ctypes.c_void_p`` it is adopted as the
        handle and nothing else is done.  ``missing`` defaults to NaN and
        ``nthread`` to 1 when left as ``None``.

        Raises
        ------
        NotImplementedError
            If ``enable_categorical`` is set.
        ValueError
            If both ``qid`` and ``group`` are given.
        """
        self.max_bin = max_bin
        self.missing = np.nan if missing is None else missing
        self.nthread = 1 if nthread is None else nthread
        self._silent = silent  # unused, kept for compatibility

        # An existing native handle is wrapped as-is.
        if isinstance(data, ctypes.c_void_p):
            self.handle = data
            return

        if enable_categorical:
            raise NotImplementedError(
                'categorical support is not enabled on DeviceQuantileDMatrix.'
            )
        if not (qid is None or group is None):
            raise ValueError(
                'Only one of the eval_qid or eval_group for each evaluation '
                'dataset should be provided.'
            )

        meta_info = {
            "label": label,
            "weight": weight,
            "base_margin": base_margin,
            "group": group,
            "qid": qid,
            "label_lower_bound": label_lower_bound,
            "label_upper_bound": label_upper_bound,
            "feature_weights": feature_weights,
        }
        self._init(
            data,
            feature_names=feature_names,
            feature_types=feature_types,
            **meta_info,
        )

    def _init(self, data, feature_names, feature_types, **meta):
        """Create the native matrix by feeding ``data`` through iterator callbacks.

        ``data`` is used directly when it already is an iterator; otherwise it is
        wrapped, together with ``meta`` and the feature information, in a
        ``SingleBatchInternalIter``.  An exception recorded on the iterator is
        re-raised in preference to the error reported by the native call.
        """
        from .data import (
            _is_dlpack,
            _transform_dlpack,
            _is_iter,
            SingleBatchInternalIter,
        )

        if _is_dlpack(data):
            # We specialize for dlpack because cupy will take the memory from it so
            # it can't be transformed twice.
            data = _transform_dlpack(data)

        if _is_iter(data):
            batch_iter = data
        else:
            batch_iter = SingleBatchInternalIter(
                data,
                feature_names=feature_names,
                feature_types=feature_types,
                **meta,
            )

        reset_proto = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
        next_proto = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)
        on_reset = reset_proto(batch_iter.reset_wrapper)
        on_next = next_proto(batch_iter.next_wrapper)

        out_handle = ctypes.c_void_p()
        status = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
            None,
            batch_iter.proxy.handle,
            on_reset,
            on_next,
            ctypes.c_float(self.missing),
            ctypes.c_int(self.nthread),
            ctypes.c_int(self.max_bin),
            ctypes.byref(out_handle),
        )
        if batch_iter.exception:
            raise batch_iter.exception
        # delay check_call to throw intermediate exception first
        _check_call(status)
        self.handle = out_handle
|
||||
|
||||
|
||||
Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
|
||||
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
|
||||
|
||||
@@ -1346,7 +1400,7 @@ class Booster(object):
|
||||
|
||||
def boost(self, dtrain, grad, hess):
|
||||
"""Boost the booster for one iteration, with customized gradient
|
||||
statistics. Like :func:`xgboost.core.Booster.update`, this
|
||||
statistics. Like :py:func:`xgboost.Booster.update`, this
|
||||
function should not be called directly by users.
|
||||
|
||||
Parameters
|
||||
@@ -1360,7 +1414,9 @@ class Booster(object):
|
||||
|
||||
"""
|
||||
if len(grad) != len(hess):
|
||||
raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
|
||||
raise ValueError(
|
||||
'grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))
|
||||
)
|
||||
if not isinstance(dtrain, DMatrix):
|
||||
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
|
||||
self._validate_features(dtrain)
|
||||
@@ -1453,17 +1509,12 @@ class Booster(object):
|
||||
training=False):
|
||||
"""Predict with data.
|
||||
|
||||
.. note:: This function is not thread safe except for ``gbtree``
|
||||
booster.
|
||||
.. note:: This function is not thread safe except for ``gbtree`` booster.
|
||||
|
||||
For ``gbtree`` booster, the thread safety is guaranteed by locks.
|
||||
For lock free prediction use ``inplace_predict`` instead. Also, the
|
||||
safety does not hold when used in conjunction with other methods.
|
||||
|
||||
When using booster other than ``gbtree``, predict can only be called
|
||||
from one thread. If you want to run prediction using multiple
|
||||
thread, call ``bst.copy()`` to make copies of model object and then
|
||||
call ``predict()``.
|
||||
When using booster other than ``gbtree``, predict can only be called from one
|
||||
thread. If you want to run prediction using multiple thread, call
|
||||
:py:meth:`xgboost.Booster.copy` to make copies of model object and then call
|
||||
``predict()``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -1579,9 +1630,17 @@ class Booster(object):
|
||||
preds = preds.reshape(nrow, chunk_size)
|
||||
return preds
|
||||
|
||||
def inplace_predict(self, data, iteration_range=(0, 0),
|
||||
predict_type='value', missing=np.nan):
|
||||
'''Run prediction in-place, Unlike ``predict`` method, inplace prediction does
|
||||
def inplace_predict(
|
||||
self,
|
||||
data,
|
||||
iteration_range: Tuple[int, int] = (0, 0),
|
||||
predict_type: str = "value",
|
||||
missing: float = np.nan,
|
||||
validate_features: bool = True,
|
||||
base_margin: Any = None,
|
||||
strict_shape: bool = False
|
||||
):
|
||||
"""Run prediction in-place, Unlike ``predict`` method, inplace prediction does
|
||||
not cache the prediction result.
|
||||
|
||||
Calling only ``inplace_predict`` in multiple threads is safe and lock
|
||||
@@ -1617,6 +1676,15 @@ class Booster(object):
|
||||
missing : float
|
||||
Value in the input data which needs to be present as a missing
|
||||
value.
|
||||
validate_features:
|
||||
See :py:meth:`xgboost.Booster.predict` for details.
|
||||
base_margin:
|
||||
See :py:obj:`xgboost.DMatrix` for details.
|
||||
strict_shape:
|
||||
When set to True, output shape is invariant to whether classification is used.
|
||||
For both value and margin prediction, the output shape is (n_samples,
|
||||
n_groups), n_groups == 1 when multi-class is not used. Default to False, in
|
||||
which case the output shape can be (n_samples, ) if multi-class is not used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -1624,107 +1692,117 @@ class Booster(object):
|
||||
The prediction result. When input data is on GPU, prediction
|
||||
result is stored in a cupy array.
|
||||
|
||||
'''
|
||||
|
||||
def reshape_output(predt, rows):
|
||||
'''Reshape for multi-output prediction.'''
|
||||
if predt.size != rows and predt.size % rows == 0:
|
||||
cols = int(predt.size / rows)
|
||||
predt = predt.reshape(rows, cols)
|
||||
return predt
|
||||
return predt
|
||||
|
||||
length = c_bst_ulong()
|
||||
"""
|
||||
preds = ctypes.POINTER(ctypes.c_float)()
|
||||
iteration_range = (ctypes.c_uint(iteration_range[0]),
|
||||
ctypes.c_uint(iteration_range[1]))
|
||||
|
||||
# once caching is supported, we can pass id(data) as cache id.
|
||||
try:
|
||||
import pandas as pd
|
||||
|
||||
if isinstance(data, pd.DataFrame):
|
||||
data = data.values
|
||||
except ImportError:
|
||||
pass
|
||||
args = {
|
||||
"type": 0,
|
||||
"training": False,
|
||||
"iteration_begin": iteration_range[0],
|
||||
"iteration_end": iteration_range[1],
|
||||
"missing": missing,
|
||||
"strict_shape": strict_shape,
|
||||
"cache_id": 0,
|
||||
}
|
||||
if predict_type == "margin":
|
||||
args["type"] = 1
|
||||
shape = ctypes.POINTER(c_bst_ulong)()
|
||||
dims = c_bst_ulong()
|
||||
|
||||
if base_margin is not None:
|
||||
proxy = _ProxyDMatrix()
|
||||
proxy.set_info(base_margin=base_margin)
|
||||
p_handle = proxy.handle
|
||||
else:
|
||||
proxy = None
|
||||
p_handle = ctypes.c_void_p()
|
||||
assert proxy is None or isinstance(proxy, _ProxyDMatrix)
|
||||
if validate_features:
|
||||
if len(data.shape) != 1 and self.num_features() != data.shape[1]:
|
||||
raise ValueError(
|
||||
f"Feature shape mismatch, expected: {self.num_features()}, "
|
||||
f"got {data.shape[0]}"
|
||||
)
|
||||
|
||||
if isinstance(data, np.ndarray):
|
||||
assert data.flags.c_contiguous
|
||||
arr = np.array(data.reshape(data.size), copy=False,
|
||||
dtype=np.float32)
|
||||
_check_call(_LIB.XGBoosterPredictFromDense(
|
||||
self.handle,
|
||||
arr.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
||||
c_bst_ulong(data.shape[0]),
|
||||
c_bst_ulong(data.shape[1]),
|
||||
ctypes.c_float(missing),
|
||||
iteration_range[0],
|
||||
iteration_range[1],
|
||||
c_str(predict_type),
|
||||
c_bst_ulong(0),
|
||||
ctypes.byref(length),
|
||||
ctypes.byref(preds)
|
||||
))
|
||||
preds = ctypes2numpy(preds, length.value, np.float32)
|
||||
rows = data.shape[0]
|
||||
return reshape_output(preds, rows)
|
||||
from .data import _maybe_np_slice
|
||||
data = _maybe_np_slice(data, data.dtype)
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromDense(
|
||||
self.handle,
|
||||
_array_interface(data),
|
||||
from_pystr_to_cstr(json.dumps(args)),
|
||||
p_handle,
|
||||
ctypes.byref(shape),
|
||||
ctypes.byref(dims),
|
||||
ctypes.byref(preds),
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, False)
|
||||
if isinstance(data, scipy.sparse.csr_matrix):
|
||||
csr = data
|
||||
_check_call(_LIB.XGBoosterPredictFromCSR(
|
||||
self.handle,
|
||||
c_array(ctypes.c_size_t, csr.indptr),
|
||||
c_array(ctypes.c_uint, csr.indices),
|
||||
c_array(ctypes.c_float, csr.data),
|
||||
ctypes.c_size_t(len(csr.indptr)),
|
||||
ctypes.c_size_t(len(csr.data)),
|
||||
ctypes.c_size_t(csr.shape[1]),
|
||||
ctypes.c_float(missing),
|
||||
iteration_range[0],
|
||||
iteration_range[1],
|
||||
c_str(predict_type),
|
||||
c_bst_ulong(0),
|
||||
ctypes.byref(length),
|
||||
ctypes.byref(preds)))
|
||||
preds = ctypes2numpy(preds, length.value, np.float32)
|
||||
rows = data.shape[0]
|
||||
return reshape_output(preds, rows)
|
||||
if lazy_isinstance(data, 'cupy.core.core', 'ndarray'):
|
||||
assert data.flags.c_contiguous
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromCSR(
|
||||
self.handle,
|
||||
_array_interface(csr.indptr),
|
||||
_array_interface(csr.indices),
|
||||
_array_interface(csr.data),
|
||||
ctypes.c_size_t(csr.shape[1]),
|
||||
from_pystr_to_cstr(json.dumps(args)),
|
||||
p_handle,
|
||||
ctypes.byref(shape),
|
||||
ctypes.byref(dims),
|
||||
ctypes.byref(preds),
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, False)
|
||||
if lazy_isinstance(data, "cupy.core.core", "ndarray"):
|
||||
from .data import _transform_cupy_array
|
||||
data = _transform_cupy_array(data)
|
||||
interface = data.__cuda_array_interface__
|
||||
if 'mask' in interface:
|
||||
interface['mask'] = interface['mask'].__cuda_array_interface__
|
||||
interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
|
||||
_check_call(_LIB.XGBoosterPredictFromArrayInterface(
|
||||
self.handle,
|
||||
interface_str,
|
||||
ctypes.c_float(missing),
|
||||
iteration_range[0],
|
||||
iteration_range[1],
|
||||
c_str(predict_type),
|
||||
c_bst_ulong(0),
|
||||
ctypes.byref(length),
|
||||
ctypes.byref(preds)))
|
||||
mem = ctypes2cupy(preds, length, np.float32)
|
||||
rows = data.shape[0]
|
||||
return reshape_output(mem, rows)
|
||||
if lazy_isinstance(data, 'cudf.core.dataframe', 'DataFrame'):
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromArrayInterface(
|
||||
self.handle,
|
||||
interface_str,
|
||||
from_pystr_to_cstr(json.dumps(args)),
|
||||
p_handle,
|
||||
ctypes.byref(shape),
|
||||
ctypes.byref(dims),
|
||||
ctypes.byref(preds),
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, True)
|
||||
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
|
||||
from .data import _cudf_array_interfaces
|
||||
interfaces_str = _cudf_array_interfaces(data)
|
||||
_check_call(_LIB.XGBoosterPredictFromArrayInterfaceColumns(
|
||||
self.handle,
|
||||
interfaces_str,
|
||||
ctypes.c_float(missing),
|
||||
iteration_range[0],
|
||||
iteration_range[1],
|
||||
c_str(predict_type),
|
||||
c_bst_ulong(0),
|
||||
ctypes.byref(length),
|
||||
ctypes.byref(preds)))
|
||||
mem = ctypes2cupy(preds, length, np.float32)
|
||||
rows = data.shape[0]
|
||||
predt = reshape_output(mem, rows)
|
||||
return predt
|
||||
|
||||
raise TypeError('Data type:' + str(type(data)) +
|
||||
' not supported by inplace prediction.')
|
||||
interfaces_str = _cudf_array_interfaces(data)
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromArrayInterfaceColumns(
|
||||
self.handle,
|
||||
interfaces_str,
|
||||
from_pystr_to_cstr(json.dumps(args)),
|
||||
p_handle,
|
||||
ctypes.byref(shape),
|
||||
ctypes.byref(dims),
|
||||
ctypes.byref(preds),
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, True)
|
||||
|
||||
raise TypeError(
|
||||
"Data type:" + str(type(data)) + " not supported by inplace prediction."
|
||||
)
|
||||
|
||||
def save_model(self, fname):
|
||||
"""Save the model to a file.
|
||||
|
||||
@@ -187,8 +187,8 @@ class DaskDMatrix:
|
||||
`DaskDMatrix` forces all lazy computation to be carried out. Wait for the input data
|
||||
explicitly if you want to see actual computation of constructing `DaskDMatrix`.
|
||||
|
||||
See doc string for DMatrix constructor for other parameters. DaskDMatrix accepts only
|
||||
dask collection.
|
||||
See doc for :py:obj:`xgboost.DMatrix` constructor for other parameters. DaskDMatrix
|
||||
accepts only dask collection.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -575,7 +575,8 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
|
||||
memory usage by eliminating data copies. Internally the all partitions/chunks of data
|
||||
are merged by weighted GK sketching. So the number of partitions from dask may affect
|
||||
training accuracy as GK generates bounded error for each merge. See doc string for
|
||||
`DeviceQuantileDMatrix` and `DMatrix` for other parameters.
|
||||
:py:obj:`xgboost.DeviceQuantileDMatrix` and :py:obj:`xgboost.DMatrix` for other
|
||||
parameters.
|
||||
|
||||
.. versionadded:: 1.2.0
|
||||
|
||||
|
||||
@@ -5,11 +5,12 @@ import ctypes
|
||||
import json
|
||||
import warnings
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .core import c_array, _LIB, _check_call, c_str
|
||||
from .core import DataIter, DeviceQuantileDMatrix, DMatrix
|
||||
from .core import DataIter, _ProxyDMatrix, DMatrix
|
||||
from .compat import lazy_isinstance
|
||||
|
||||
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
||||
@@ -113,7 +114,7 @@ def _maybe_np_slice(data, dtype):
|
||||
return data
|
||||
|
||||
|
||||
def _transform_np_array(data: np.ndarray):
|
||||
def _transform_np_array(data: np.ndarray) -> np.ndarray:
|
||||
if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
|
||||
data = np.array(data, copy=False)
|
||||
if len(data.shape) != 2:
|
||||
@@ -142,7 +143,7 @@ def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
|
||||
input layout and type if memory use is a concern.
|
||||
|
||||
"""
|
||||
flatten = _transform_np_array(data)
|
||||
flatten: np.ndarray = _transform_np_array(data)
|
||||
handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGDMatrixCreateFromMat_omp(
|
||||
flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
||||
@@ -783,54 +784,6 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
|
||||
self.it = 0
|
||||
|
||||
|
||||
def init_device_quantile_dmatrix(
|
||||
data, missing, max_bin, threads, feature_names, feature_types, **meta
|
||||
):
|
||||
'''Constructor for DeviceQuantileDMatrix.'''
|
||||
if not any([_is_cudf_df(data), _is_cudf_ser(data), _is_cupy_array(data),
|
||||
_is_dlpack(data), _is_iter(data)]):
|
||||
raise TypeError(str(type(data)) +
|
||||
' is not supported for DeviceQuantileDMatrix')
|
||||
if _is_dlpack(data):
|
||||
# We specialize for dlpack because cupy will take the memory from it so
|
||||
# it can't be transformed twice.
|
||||
data = _transform_dlpack(data)
|
||||
if _is_iter(data):
|
||||
it = data
|
||||
else:
|
||||
it = SingleBatchInternalIter(
|
||||
data, **meta, feature_names=feature_names,
|
||||
feature_types=feature_types)
|
||||
|
||||
reset_factory = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
||||
reset_callback = reset_factory(it.reset_wrapper)
|
||||
next_factory = ctypes.CFUNCTYPE(
|
||||
ctypes.c_int,
|
||||
ctypes.c_void_p,
|
||||
)
|
||||
next_callback = next_factory(it.next_wrapper)
|
||||
handle = ctypes.c_void_p()
|
||||
ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
|
||||
None,
|
||||
it.proxy.handle,
|
||||
reset_callback,
|
||||
next_callback,
|
||||
ctypes.c_float(missing),
|
||||
ctypes.c_int(threads),
|
||||
ctypes.c_int(max_bin),
|
||||
ctypes.byref(handle)
|
||||
)
|
||||
if it.exception:
|
||||
raise it.exception
|
||||
# delay check_call to throw intermediate exception first
|
||||
_check_call(ret)
|
||||
matrix = DeviceQuantileDMatrix(handle)
|
||||
feature_names = matrix.feature_names
|
||||
feature_types = matrix.feature_types
|
||||
matrix.handle = None
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _device_quantile_transform(data, feature_names, feature_types):
|
||||
if _is_cudf_df(data):
|
||||
return _transform_cudf_df(data, feature_names, feature_types)
|
||||
@@ -845,7 +798,7 @@ def _device_quantile_transform(data, feature_names, feature_types):
|
||||
str(type(data)))
|
||||
|
||||
|
||||
def dispatch_device_quantile_dmatrix_set_data(proxy, data):
|
||||
def dispatch_device_quantile_dmatrix_set_data(proxy: _ProxyDMatrix, data: Any) -> None:
|
||||
'''Dispatch for DeviceQuantileDMatrix.'''
|
||||
if _is_cudf_df(data):
|
||||
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
|
||||
|
||||
Reference in New Issue
Block a user