Cudf support. (#4745)

* Initial support for cudf integration. * Add two C APIs for consuming data and metainfo. * Add CopyFrom for SimpleCSRSource as a generic function to consume the data. * Add FromDeviceColumnar for consuming device data. * Add new MetaInfo::SetInfo for consuming label, weight etc.
2019-08-19 00:51:40 -04:00
parent ab357dd41c
commit 9700776597
26 changed files with 1385 additions and 287 deletions
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -124,9 +124,16 @@ except ImportError:
    class DataTable(object):
        """ dummy for datatable.DataTable """

-
    DT_INSTALLED = False

+
+try:
+    from cudf import DataFrame as CUDF_DataFrame
+    CUDF_INSTALLED = True
+except ImportError:
+    CUDF_DataFrame = object
+    CUDF_INSTALLED = False
+
 # sklearn
 try:
    from sklearn.base import BaseEstimator
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -1,26 +1,27 @@
 # coding: utf-8
 # pylint: disable=too-many-arguments, too-many-branches, invalid-name
 # pylint: disable=too-many-branches, too-many-lines, too-many-locals
+# pylint: disable=too-many-public-methods
 """Core XGBoost Library."""
-from __future__ import absolute_import
 import collections
 # pylint: disable=no-name-in-module,import-error
-try:
-    from collections.abc import Mapping  # Python 3
-except ImportError:
-    from collections import Mapping  # Python 2
+from collections.abc import Mapping  # Python 3
 # pylint: enable=no-name-in-module,import-error
+import math
 import ctypes
 import os
 import re
 import sys
 import warnings
+import json

 import numpy as np
 import scipy.sparse

 from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
-                     PANDAS_INSTALLED, DataTable, os_fspath, os_PathLike)
+                     PANDAS_INSTALLED, DataTable,
+                     CUDF_INSTALLED, CUDF_DataFrame,
+                     os_fspath, os_PathLike)
 from .libpath import find_lib_path


@@ -131,8 +132,10 @@ def _load_lib():
    os_error_list = []
    for lib_path in lib_paths:
        try:
-            # needed when the lib is linked with non-system-available dependencies
-            os.environ['PATH'] = os.pathsep.join(pathBackup + [os.path.dirname(lib_path)])
+            # needed when the lib is linked with non-system-available
+            # dependencies
+            os.environ['PATH'] = os.pathsep.join(
+                pathBackup + [os.path.dirname(lib_path)])
            lib = ctypes.cdll.LoadLibrary(lib_path)
            lib_success = True
        except OSError as e:
@@ -217,6 +220,51 @@ def c_array(ctype, values):
    return (ctype * len(values))(*values)


+def _use_columnar_initializer(data):
+    '''Whether should we use columnar format initializer (pass data in as
+json string).  Currently cudf is the only valid option.'''
+    if CUDF_INSTALLED and isinstance(data, CUDF_DataFrame):
+        return True
+    return False
+
+
+def _extract_interface_from_cudf(df, is_info):
+    '''This function should be upstreamed to cudf.'''
+    if not _use_columnar_initializer(df):
+        raise ValueError('Only cudf is supported for initializing as json ' +
+                         'columnar format.  For other libraries please ' +
+                         'refer to specific API.')
+
+    def get_interface(obj):
+        return obj.mem.__cuda_array_interface__
+
+    array_interfaces = []
+    for col in df.columns:
+        data = df[col].data
+        array_interfaces.append(get_interface(data))
+
+    validity_masks = []
+    for col in df.columns:
+        if df[col].has_null_mask:
+            mask_interface = get_interface(df[col].nullmask)
+            mask_interface['null_count'] = df[col].null_count
+            validity_masks.append(mask_interface)
+        else:
+            validity_masks.append(False)
+
+    for i in range(len(df.columns)):
+        col_interface = array_interfaces[i]
+        mask_interface = validity_masks[i]
+        if mask_interface is not False:
+            col_interface['mask'] = mask_interface
+
+    if is_info:
+        array_interfaces = array_interfaces[0]
+
+    interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8')
+    return interfaces
+
+
 PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
                       'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
                       'float16': 'float', 'float32': 'float', 'float64': 'float',
@@ -256,15 +304,18 @@ def _maybe_pandas_data(data, feature_names, feature_types):


 def _maybe_pandas_label(label):
-    """ Extract internal data from pd.DataFrame for DMatrix label """
+    """Extract internal data from pd.DataFrame for DMatrix label."""

    if PANDAS_INSTALLED and isinstance(label, DataFrame):
        if len(label.columns) > 1:
-            raise ValueError('DataFrame for label cannot have multiple columns')
+            raise ValueError(
+                'DataFrame for label cannot have multiple columns')

        label_dtypes = label.dtypes
-        if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
-            raise ValueError('DataFrame.dtypes for label must be int, float or bool')
+        if not all(dtype.name in PANDAS_DTYPE_MAPPER
+                   for dtype in label_dtypes):
+            raise ValueError(
+                'DataFrame.dtypes for label must be int, float or bool')
        label = label.values.astype('float')
    # pd.Series can be passed to xgb as it is

@@ -318,6 +369,22 @@ def _maybe_dt_array(array):
    return array


+def _check_data(data, missing):
+    '''The missing value applies only to np.ndarray.'''
+    is_invalid = (not isinstance(data, np.ndarray)) and (missing is not None)
+    is_invalid = is_invalid and not math.isnan(missing)
+    if is_invalid:
+        raise ValueError(
+            'missing value only applies to dense input, ' +
+            'e.g. `numpy.ndarray`.' +
+            ' For a possibly sparse data type: ' + str(type(data)) +
+            ' please remove missing values or set it to nan.' +
+            ' Current missing value is set to: ' + str(missing))
+    if isinstance(data, list):
+        warnings.warn('Initializing DMatrix from List is deprecated.',
+                      DeprecationWarning)
+
+
 class DMatrix(object):
    """Data Matrix used in XGBoost.

@@ -336,15 +403,16 @@ class DMatrix(object):
        """
        Parameters
        ----------
-        data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame
+        data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
+               dt.Frame/cudf.DataFrame
            Data source of DMatrix.
            When data is string or os.PathLike type, it represents the path libsvm format
            txt file, or binary file that xgboost can read from.
        label : list or numpy 1-D array, optional
            Label of the training data.
        missing : float, optional
-            Value in the data which needs to be present as a missing value. If
-            None, defaults to np.nan.
+            Value in the dense input data (e.g. `numpy.ndarray`) which needs
+            to be present as a missing value. If None, defaults to np.nan.
        weight : list or numpy 1-D array , optional
            Weight for each instance.

@@ -375,6 +443,8 @@ class DMatrix(object):
                self._feature_types = feature_types
            return

+        _check_data(data, missing)
+
        data, feature_names, feature_types = _maybe_pandas_data(data,
                                                                feature_names,
                                                                feature_types)
@@ -382,14 +452,11 @@ class DMatrix(object):
        data, feature_names, feature_types = _maybe_dt_data(data,
                                                            feature_names,
                                                            feature_types)
+
        label = _maybe_pandas_label(label)
        label = _maybe_dt_array(label)
        weight = _maybe_dt_array(weight)

-        if isinstance(data, list):
-            warnings.warn('Initializing DMatrix from List is deprecated.',
-                          DeprecationWarning)
-
        if isinstance(data, (STRING_TYPES, os_PathLike)):
            handle = ctypes.c_void_p()
            _check_call(_LIB.XGDMatrixCreateFromFile(c_str(os_fspath(data)),
@@ -404,6 +471,8 @@ class DMatrix(object):
            self._init_from_npy2d(data, missing, nthread)
        elif isinstance(data, DataTable):
            self._init_from_dt(data, nthread)
+        elif _use_columnar_initializer(data):
+            self._init_from_columnar(data)
        else:
            try:
                csr = scipy.sparse.csr_matrix(data)
@@ -415,11 +484,15 @@ class DMatrix(object):
        if label is not None:
            if isinstance(label, np.ndarray):
                self.set_label_npy2d(label)
+            elif _use_columnar_initializer(label):
+                self.set_interface_info('label', label)
            else:
                self.set_label(label)
        if weight is not None:
            if isinstance(weight, np.ndarray):
                self.set_weight_npy2d(weight)
+            elif _use_columnar_initializer(label):
+                self.set_interface_info('weight', weight)
            else:
                self.set_weight(weight)

@@ -526,8 +599,19 @@ class DMatrix(object):
            nthread))
        self.handle = handle

+    def _init_from_columnar(self, df):
+        '''Initialize DMatrix from columnar memory format.
+
+        '''
+        interfaces = _extract_interface_from_cudf(df, False)
+        handle = ctypes.c_void_p()
+        _check_call(
+            _LIB.XGDMatrixCreateFromArrayInterfaces(interfaces,
+                                                    ctypes.byref(handle)))
+        self.handle = handle
+
    def __del__(self):
-        if hasattr(self, "handle") and self.handle is not None:
+        if hasattr(self, "handle") and self.handle:
            _check_call(_LIB.XGDMatrixFree(self.handle))
            self.handle = None

@@ -593,6 +677,13 @@ class DMatrix(object):
                                               c_data,
                                               c_bst_ulong(len(data))))

+    def set_interface_info(self, field, data):
+        '''Set info type peoperty into DMatrix.'''
+        interfaces = _extract_interface_from_cudf(data, True)
+        _check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle,
+                                                       c_str(field),
+                                                       interfaces))
+
    def set_float_info_npy2d(self, field, data):
        """Set float type property into the DMatrix
           for numpy 2d array input
@@ -732,7 +823,10 @@ class DMatrix(object):
        margin: array like
            Prediction margin of each datapoint
        """
-        self.set_float_info('base_margin', margin)
+        if _use_columnar_initializer(margin):
+            self.set_interface_info('base_margin', margin)
+        else:
+            self.set_float_info('base_margin', margin)

    def set_group(self, group):
        """Set group size of DMatrix (used for ranking).
@@ -742,9 +836,12 @@ class DMatrix(object):
        group : array like
            Group size of each group
        """
-        _check_call(_LIB.XGDMatrixSetGroup(self.handle,
-                                           c_array(ctypes.c_uint, group),
-                                           c_bst_ulong(len(group))))
+        if _use_columnar_initializer(group):
+            self.set_interface_info('group', group)
+        else:
+            _check_call(_LIB.XGDMatrixSetGroup(self.handle,
+                                               c_array(ctypes.c_uint, group),
+                                               c_bst_ulong(len(group))))

    def get_label(self):
        """Get the label of the DMatrix.
@@ -831,7 +928,8 @@ class DMatrix(object):
        feature_names : list or None
        """
        if self._feature_names is None:
-            self._feature_names = ['f{0}'.format(i) for i in range(self.num_col())]
+            self._feature_names = ['f{0}'.format(i)
+                                   for i in range(self.num_col())]
        return self._feature_names

    @property