Cudf support. (#4745)

* Initial support for cudf integration.

* Add two C APIs for consuming data and metainfo.

* Add CopyFrom for SimpleCSRSource as a generic function to consume the data.

* Add FromDeviceColumnar for consuming device data.

* Add new MetaInfo::SetInfo for consuming label, weight etc.
This commit is contained in:
Jiaming Yuan
2019-08-19 00:51:40 -04:00
committed by Rory Mitchell
parent ab357dd41c
commit 9700776597
26 changed files with 1385 additions and 287 deletions

View File

@@ -124,9 +124,16 @@ except ImportError:
class DataTable(object):
""" dummy for datatable.DataTable """
DT_INSTALLED = False
try:
from cudf import DataFrame as CUDF_DataFrame
CUDF_INSTALLED = True
except ImportError:
CUDF_DataFrame = object
CUDF_INSTALLED = False
# sklearn
try:
from sklearn.base import BaseEstimator

View File

@@ -1,26 +1,27 @@
# coding: utf-8
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
# pylint: disable=too-many-branches, too-many-lines, too-many-locals
# pylint: disable=too-many-public-methods
"""Core XGBoost Library."""
from __future__ import absolute_import
import collections
# pylint: disable=no-name-in-module,import-error
try:
from collections.abc import Mapping # Python 3
except ImportError:
from collections import Mapping # Python 2
from collections.abc import Mapping # Python 3
# pylint: enable=no-name-in-module,import-error
import math
import ctypes
import os
import re
import sys
import warnings
import json
import numpy as np
import scipy.sparse
from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
PANDAS_INSTALLED, DataTable, os_fspath, os_PathLike)
PANDAS_INSTALLED, DataTable,
CUDF_INSTALLED, CUDF_DataFrame,
os_fspath, os_PathLike)
from .libpath import find_lib_path
@@ -131,8 +132,10 @@ def _load_lib():
os_error_list = []
for lib_path in lib_paths:
try:
# needed when the lib is linked with non-system-available dependencies
os.environ['PATH'] = os.pathsep.join(pathBackup + [os.path.dirname(lib_path)])
# needed when the lib is linked with non-system-available
# dependencies
os.environ['PATH'] = os.pathsep.join(
pathBackup + [os.path.dirname(lib_path)])
lib = ctypes.cdll.LoadLibrary(lib_path)
lib_success = True
except OSError as e:
@@ -217,6 +220,51 @@ def c_array(ctype, values):
return (ctype * len(values))(*values)
def _use_columnar_initializer(data):
    '''Tell whether `data` should go through the columnar initializer.

    The columnar initializer passes the data in as a json string.
    Currently a cudf DataFrame is the only input that qualifies.

    Returns
    -------
    bool
        True only when cudf is installed and `data` is a cudf DataFrame.
    '''
    return bool(CUDF_INSTALLED and isinstance(data, CUDF_DataFrame))
def _extract_interface_from_cudf(df, is_info):
    '''Serialize the column descriptions of a cudf DataFrame as json bytes.

    This function should be upstreamed to cudf.

    Parameters
    ----------
    df : cudf.DataFrame
        Frame whose columns are described.  Any other input is rejected.
    is_info : bool
        When true, only the description of the first column is serialized;
        otherwise the list of all column descriptions is serialized.

    Returns
    -------
    interfaces : bytes
        utf-8 encoded json document built from the
        `__cuda_array_interface__` of every column.  A column that has a null
        mask carries the interface of that mask, extended with `null_count`,
        under the `mask` key.

    Raises
    ------
    ValueError
        If `df` is not accepted by `_use_columnar_initializer`.
    '''
    if not _use_columnar_initializer(df):
        raise ValueError('Only cudf is supported for initializing as json '
                         'columnar format. For other libraries please '
                         'refer to specific API.')

    def describe(buffer):
        return buffer.mem.__cuda_array_interface__

    columns = [df[name] for name in df.columns]
    descriptions = [describe(column.data) for column in columns]

    # Attach the validity mask to the description of each nullable column.
    for column, description in zip(columns, descriptions):
        if column.has_null_mask:
            mask = describe(column.nullmask)
            mask['null_count'] = column.null_count
            description['mask'] = mask

    payload = descriptions[0] if is_info else descriptions
    return json.dumps(payload, indent=2).encode('utf-8')
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float',
@@ -256,15 +304,18 @@ def _maybe_pandas_data(data, feature_names, feature_types):
def _maybe_pandas_label(label):
""" Extract internal data from pd.DataFrame for DMatrix label """
"""Extract internal data from pd.DataFrame for DMatrix label."""
if PANDAS_INSTALLED and isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
raise ValueError(
'DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
if not all(dtype.name in PANDAS_DTYPE_MAPPER
for dtype in label_dtypes):
raise ValueError(
'DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float')
# pd.Series can be passed to xgb as it is
@@ -318,6 +369,22 @@ def _maybe_dt_array(array):
return array
def _check_data(data, missing):
'''The missing value applies only to np.ndarray.'''
is_invalid = (not isinstance(data, np.ndarray)) and (missing is not None)
is_invalid = is_invalid and not math.isnan(missing)
if is_invalid:
raise ValueError(
'missing value only applies to dense input, ' +
'e.g. `numpy.ndarray`.' +
' For a possibly sparse data type: ' + str(type(data)) +
' please remove missing values or set it to nan.' +
' Current missing value is set to: ' + str(missing))
if isinstance(data, list):
warnings.warn('Initializing DMatrix from List is deprecated.',
DeprecationWarning)
class DMatrix(object):
"""Data Matrix used in XGBoost.
@@ -336,15 +403,16 @@ class DMatrix(object):
"""
Parameters
----------
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
dt.Frame/cudf.DataFrame
Data source of DMatrix.
When data is a string or os.PathLike type, it represents the path to a
libsvm format txt file, or a binary file that xgboost can read from.
label : list or numpy 1-D array, optional
Label of the training data.
missing : float, optional
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
Value in the dense input data (e.g. `numpy.ndarray`) which needs
to be present as a missing value. If None, defaults to np.nan.
weight : list or numpy 1-D array , optional
Weight for each instance.
@@ -375,6 +443,8 @@ class DMatrix(object):
self._feature_types = feature_types
return
_check_data(data, missing)
data, feature_names, feature_types = _maybe_pandas_data(data,
feature_names,
feature_types)
@@ -382,14 +452,11 @@ class DMatrix(object):
data, feature_names, feature_types = _maybe_dt_data(data,
feature_names,
feature_types)
label = _maybe_pandas_label(label)
label = _maybe_dt_array(label)
weight = _maybe_dt_array(weight)
if isinstance(data, list):
warnings.warn('Initializing DMatrix from List is deprecated.',
DeprecationWarning)
if isinstance(data, (STRING_TYPES, os_PathLike)):
handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixCreateFromFile(c_str(os_fspath(data)),
@@ -404,6 +471,8 @@ class DMatrix(object):
self._init_from_npy2d(data, missing, nthread)
elif isinstance(data, DataTable):
self._init_from_dt(data, nthread)
elif _use_columnar_initializer(data):
self._init_from_columnar(data)
else:
try:
csr = scipy.sparse.csr_matrix(data)
@@ -415,11 +484,15 @@ class DMatrix(object):
# Attach label and weight, picking the setter that matches the container
# type: numpy arrays use the npy2d setters, columnar (cudf) inputs are
# passed through their array interface, anything else uses the generic
# setters.
if label is not None:
    if isinstance(label, np.ndarray):
        self.set_label_npy2d(label)
    elif _use_columnar_initializer(label):
        self.set_interface_info('label', label)
    else:
        self.set_label(label)
if weight is not None:
    if isinstance(weight, np.ndarray):
        self.set_weight_npy2d(weight)
    # The dispatch must look at `weight` itself, not at `label`: the two
    # may be supplied in different container types.
    elif _use_columnar_initializer(weight):
        self.set_interface_info('weight', weight)
    else:
        self.set_weight(weight)
@@ -526,8 +599,19 @@ class DMatrix(object):
nthread))
self.handle = handle
def _init_from_columnar(self, df):
    '''Initialize DMatrix from columnar memory format.

    Parameters
    ----------
    df : cudf.DataFrame
        Frame whose column interfaces are extracted with
        `_extract_interface_from_cudf` and handed to the native library.
    '''
    payload = _extract_interface_from_cudf(df, False)
    new_handle = ctypes.c_void_p()
    status = _LIB.XGDMatrixCreateFromArrayInterfaces(
        payload, ctypes.byref(new_handle))
    _check_call(status)
    self.handle = new_handle
def __del__(self):
if hasattr(self, "handle") and self.handle is not None:
if hasattr(self, "handle") and self.handle:
_check_call(_LIB.XGDMatrixFree(self.handle))
self.handle = None
@@ -593,6 +677,13 @@ class DMatrix(object):
c_data,
c_bst_ulong(len(data))))
def set_interface_info(self, field, data):
    '''Set info type property into DMatrix.

    Parameters
    ----------
    field : str
        The field name of the information.
    data : cudf.DataFrame
        Frame holding the information; only the interface of its first
        column is serialized and passed to the native library.
    '''
    payload = _extract_interface_from_cudf(data, True)
    status = _LIB.XGDMatrixSetInfoFromInterface(
        self.handle, c_str(field), payload)
    _check_call(status)
def set_float_info_npy2d(self, field, data):
"""Set float type property into the DMatrix
for numpy 2d array input
@@ -732,7 +823,10 @@ class DMatrix(object):
margin: array like
Prediction margin of each datapoint
"""
self.set_float_info('base_margin', margin)
if _use_columnar_initializer(margin):
self.set_interface_info('base_margin', margin)
else:
self.set_float_info('base_margin', margin)
def set_group(self, group):
"""Set group size of DMatrix (used for ranking).
@@ -742,9 +836,12 @@ class DMatrix(object):
group : array like
Group size of each group
"""
_check_call(_LIB.XGDMatrixSetGroup(self.handle,
c_array(ctypes.c_uint, group),
c_bst_ulong(len(group))))
if _use_columnar_initializer(group):
self.set_interface_info('group', group)
else:
_check_call(_LIB.XGDMatrixSetGroup(self.handle,
c_array(ctypes.c_uint, group),
c_bst_ulong(len(group))))
def get_label(self):
"""Get the label of the DMatrix.
@@ -831,7 +928,8 @@ class DMatrix(object):
feature_names : list or None
"""
if self._feature_names is None:
self._feature_names = ['f{0}'.format(i) for i in range(self.num_col())]
self._feature_names = ['f{0}'.format(i)
for i in range(self.num_col())]
return self._feature_names
@property