Device dmatrix (#5420)

This commit is contained in:
Rory Mitchell
2020-03-28 14:42:21 +13:00
committed by GitHub
parent 780de49ddb
commit 13b10a6370
24 changed files with 915 additions and 310 deletions

View File

@@ -8,12 +8,13 @@ import os
import sys
import warnings
from .core import DMatrix, Booster
from .core import DMatrix, DeviceQuantileDMatrix, Booster
from .training import train, cv
from . import rabit # noqa
from . import rabit # noqa
from . import tracker # noqa
from .tracker import RabitTracker # noqa
from . import dask
try:
from .sklearn import XGBModel, XGBClassifier, XGBRegressor, XGBRanker
from .sklearn import XGBRFClassifier, XGBRFRegressor
@@ -31,7 +32,7 @@ VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION')
with open(VERSION_FILE) as f:
__version__ = f.read().strip()
__all__ = ['DMatrix', 'Booster',
__all__ = ['DMatrix', 'DeviceQuantileDMatrix', 'Booster',
'train', 'cv',
'RabitTracker',
'XGBModel', 'XGBClassifier', 'XGBRegressor', 'XGBRanker',

View File

@@ -291,6 +291,18 @@ def _maybe_pandas_data(data, feature_names, feature_types,
return data, feature_names, feature_types
def _cudf_array_interfaces(df):
'''Extract CuDF __cuda_array_interface__'''
interfaces = []
for col in df:
interface = df[col].__cuda_array_interface__
if 'mask' in interface:
interface['mask'] = interface['mask'].__cuda_array_interface__
interfaces.append(interface)
interfaces_str = bytes(json.dumps(interfaces, indent=2), 'utf-8')
return interfaces_str
def _maybe_cudf_dataframe(data, feature_names, feature_types):
"""Extract internal data from cudf.DataFrame for DMatrix data."""
if not (CUDF_INSTALLED and isinstance(data,
@@ -596,16 +608,10 @@ class DMatrix(object):
def _init_from_array_interface_columns(self, df, missing, nthread):
"""Initialize DMatrix from columnar memory format."""
interfaces = []
for col in df:
interface = df[col].__cuda_array_interface__
if 'mask' in interface:
interface['mask'] = interface['mask'].__cuda_array_interface__
interfaces.append(interface)
interfaces_str = _cudf_array_interfaces(df)
handle = ctypes.c_void_p()
missing = missing if missing is not None else np.nan
nthread = nthread if nthread is not None else 1
interfaces_str = bytes(json.dumps(interfaces, indent=2), 'utf-8')
_check_call(
_LIB.XGDMatrixCreateFromArrayInterfaceColumns(
interfaces_str,
@@ -1005,6 +1011,65 @@ class DMatrix(object):
self._feature_types = feature_types
class DeviceQuantileDMatrix(DMatrix):
"""Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do not
use this for test/validation tasks as some information may be lost in quantisation. This
DMatrix is primarily designed to save memory in training and avoids intermediate steps,
directly creating a compressed representation for training without allocating additional
memory. Implementation does not currently consider weights in quantisation process(unlike
DMatrix).
You can construct DeviceDMatrix from cupy/cudf
"""
def __init__(self, data, label=None, weight=None, base_margin=None,
missing=None,
silent=False,
feature_names=None,
feature_types=None,
nthread=None, max_bin=256):
self.max_bin = max_bin
if not (hasattr(data, "__cuda_array_interface__") or (
CUDF_INSTALLED and isinstance(data, CUDF_DataFrame))):
raise ValueError('Only cupy/cudf currently supported for DeviceDMatrix')
super().__init__(data, label=label, weight=weight, base_margin=base_margin,
missing=missing,
silent=silent,
feature_names=feature_names,
feature_types=feature_types,
nthread=nthread)
def _init_from_array_interface_columns(self, df, missing, nthread):
"""Initialize DMatrix from columnar memory format."""
interfaces_str = _cudf_array_interfaces(df)
handle = ctypes.c_void_p()
missing = missing if missing is not None else np.nan
nthread = nthread if nthread is not None else 1
_check_call(
_LIB.XGDeviceQuantileDMatrixCreateFromArrayInterfaceColumns(
interfaces_str,
ctypes.c_float(missing), ctypes.c_int(nthread),
ctypes.c_int(self.max_bin), ctypes.byref(handle)))
self.handle = handle
def _init_from_array_interface(self, data, missing, nthread):
"""Initialize DMatrix from cupy ndarray."""
interface = data.__cuda_array_interface__
if 'mask' in interface:
interface['mask'] = interface['mask'].__cuda_array_interface__
interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
handle = ctypes.c_void_p()
missing = missing if missing is not None else np.nan
nthread = nthread if nthread is not None else 1
_check_call(
_LIB.XGDeviceQuantileDMatrixCreateFromArrayInterface(
interface_str,
ctypes.c_float(missing), ctypes.c_int(nthread),
ctypes.c_int(self.max_bin), ctypes.byref(handle)))
self.handle = handle
class Booster(object):
# pylint: disable=too-many-public-methods
"""A Booster of XGBoost.