Support feature names/types for cudf. (#4902)

* Implement most of the pandas procedure for cudf except for type conversion. * Requires an array of interfaces in metainfo.
2019-09-29 15:07:51 -04:00 · 2019-09-29 15:07:51 -04:00 · d30e63a0a5
commit d30e63a0a5
parent 2fa8b359e0
6 changed files with 90 additions and 25 deletions
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@ -127,6 +127,10 @@ class MetaInfo {
   * \brief Set information in the meta info with array interface.
   * \param key The key of the information.
   * \param interface_str String representation of json format array interface.
+   *
+   *          [ column_0, column_1, ... column_n ]
+   *
+   *        Right now only 1 column is permitted.
   */
  void SetInfo(const char* key, std::string const& interface_str);

--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@ -133,10 +133,12 @@ except ImportError:
 try:
    from cudf import DataFrame as CUDF_DataFrame
    from cudf import Series as CUDF_Series
+    from cudf import MultiIndex as CUDF_MultiIndex
    CUDF_INSTALLED = True
 except ImportError:
    CUDF_DataFrame = object
    CUDF_Series = object
+    CUDF_MultiIndex = object
    CUDF_INSTALLED = False

 # sklearn
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@ -17,9 +17,10 @@ import json
 import numpy as np
 import scipy.sparse

-from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
+from .compat import (
+    STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
    PANDAS_INSTALLED, DataTable,
-                     CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
+    CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series, CUDF_MultiIndex,
    os_fspath, os_PathLike)
 from .libpath import find_lib_path

@ -236,14 +237,18 @@ def c_str(string):

 def c_array(ctype, values):
    """Convert a python string to c array."""
-    if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
+    if (isinstance(values, np.ndarray)
+            and values.dtype.itemsize == ctypes.sizeof(ctype)):
        return (ctype * len(values)).from_buffer_copy(values)
    return (ctype * len(values))(*values)


 def _use_columnar_initializer(data):
-    '''Whether should we use columnar format initializer (pass data in as
-    json string).  Currently cudf is the only valid option.'''
+    '''Whether should we use columnar format initializer (pass data in as json
+    string).  Currently cudf is the only valid option.  For other dataframe
+    types, use their sepcific API instead.
+
+    '''
    if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
        return True
    return False
@ -258,7 +263,7 @@ def _extract_interface_from_cudf_series(data):
    return interface


-def _extract_interface_from_cudf(df, is_info):
+def _extract_interface_from_cudf(df):
    """This function should be upstreamed to cudf."""
    if not _use_columnar_initializer(df):
        raise ValueError('Only cudf is supported for initializing as json ' +
@ -273,9 +278,6 @@ def _extract_interface_from_cudf(df, is_info):
    else:
        array_interfaces.append(_extract_interface_from_cudf_series(df))

-    if is_info:
-        array_interfaces = array_interfaces[0]
-
    interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8')
    return interfaces

@ -337,6 +339,30 @@ def _maybe_pandas_label(label):
    return label


+def _maybe_cudf_dataframe(data, feature_names, feature_types):
+    '''Extract internal data from cudf.DataFrame for DMatrix data.'''
+    if not (CUDF_INSTALLED and isinstance(data,
+                                          (CUDF_DataFrame, CUDF_Series))):
+        return data, feature_names, feature_types
+    if feature_names is None:
+        if isinstance(data, CUDF_Series):
+            feature_names = [data.name]
+        elif isinstance(data.columns, CUDF_MultiIndex):
+            feature_names = [
+                ' '.join([str(x) for x in i])
+                for i in data.columns
+            ]
+        else:
+            feature_names = data.columns.format()
+    if feature_types is None:
+        if isinstance(data, CUDF_Series):
+            dtypes = [data.dtype]
+        else:
+            dtypes = data.dtypes
+        feature_types = [PANDAS_DTYPE_MAPPER[d.name] for d in dtypes]
+    return data, feature_names, feature_types
+
+
 DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'}

 DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
@ -384,6 +410,21 @@ def _maybe_dt_array(array):
    return array


+def _convert_dataframes(data, feature_names, feature_types):
+    data, feature_names, feature_types = _maybe_pandas_data(data,
+                                                            feature_names,
+                                                            feature_types)
+
+    data, feature_names, feature_types = _maybe_dt_data(data,
+                                                        feature_names,
+                                                        feature_types)
+
+    data, feature_names, feature_types = _maybe_cudf_dataframe(
+        data, feature_names, feature_types)
+
+    return data, feature_names, feature_types
+
+
 class DMatrix(object):
    """Data Matrix used in XGBoost.

@ -404,8 +445,10 @@ class DMatrix(object):
        data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
               dt.Frame/cudf.DataFrame
            Data source of DMatrix.
-            When data is string or os.PathLike type, it represents the path libsvm format
-            txt file, or binary file that xgboost can read from.
+            When data is string or os.PathLike type, it represents the path
+            libsvm format txt file, csv file (by specifying uri parameter
+            'path_to_csv?format=csv'), or binary file that xgboost can read
+            from.
        label : list, numpy 1-D array or cudf.DataFrame, optional
            Label of the training data.
        missing : float, optional
@ -445,13 +488,9 @@ class DMatrix(object):
        if isinstance(data, list):
            raise TypeError('Input data can not be a list.')

-        data, feature_names, feature_types = _maybe_pandas_data(data,
-                                                                feature_names,
-                                                                feature_types)
-
-        data, feature_names, feature_types = _maybe_dt_data(data,
-                                                            feature_names,
-                                                            feature_types)
+        data, feature_names, feature_types = _convert_dataframes(
+            data, feature_names, feature_types
+        )

        label = _maybe_pandas_label(label)
        label = _maybe_dt_array(label)
@ -604,7 +643,7 @@ class DMatrix(object):
        '''Initialize DMatrix from columnar memory format.

        '''
-        interfaces = _extract_interface_from_cudf(df, False)
+        interfaces = _extract_interface_from_cudf(df)
        handle = ctypes.c_void_p()
        has_missing = missing is not None
        missing = missing if has_missing else np.nan
@ -683,7 +722,7 @@ class DMatrix(object):

    def set_interface_info(self, field, data):
        '''Set info type peoperty into DMatrix.'''
-        interfaces = _extract_interface_from_cudf(data, True)
+        interfaces = _extract_interface_from_cudf(data)
        _check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle,
                                                       c_str(field),
                                                       interfaces))
--- a/src/data/data.cu
+++ b/src/data/data.cu
@ -35,8 +35,10 @@ void CopyInfoImpl(std::map<std::string, Json> const& column, HostDeviceVector<fl
 }

 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
-  Json j_arr = Json::Load({interface_str.c_str(), interface_str.size()});
-  auto const& j_arr_obj = get<Object>(j_arr);
+  Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
+  auto const& j_arr = get<Array>(j_interface);
+  CHECK_EQ(j_arr.size(), 1) << "MetaInfo: " << c_key << ". " << ColumnarErrors::Dimension(1);;
+  auto const& j_arr_obj = get<Object const>(j_arr[0]);
  std::string key {c_key};
  ArrayInterfaceHandler::Validate(j_arr_obj);
  if (j_arr_obj.find("mask") != j_arr_obj.cend()) {
--- a/tests/cpp/data/test_metainfo.cu
+++ b/tests/cpp/data/test_metainfo.cu
@ -31,9 +31,10 @@ std::string PrepareData(std::string typestr, thrust::device_vector<T>* out) {
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
  column["data"] = j_data;
+  Json array(std::vector<Json>{column});

  std::stringstream ss;
-  Json::Dump(column, &ss);
+  Json::Dump(array, &ss);
  std::string str = ss.str();

  return str;
--- a/tests/python-gpu/test_from_columnar.py
+++ b/tests/python-gpu/test_from_columnar.py
@ -42,6 +42,7 @@ Arrow specification.'''
    @pytest.mark.skipif(**tm.no_cudf())
    def test_from_cudf(self):
        '''Test constructing DMatrix from cudf'''
+        import cudf
        dmatrix_from_cudf(np.float32, np.NAN)
        dmatrix_from_cudf(np.float64, np.NAN)

@ -52,3 +53,19 @@ Arrow specification.'''
        dmatrix_from_cudf(np.int8, 2)
        dmatrix_from_cudf(np.int32, -2)
        dmatrix_from_cudf(np.int64, -3)
+
+        cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
+        dtrain = xgb.DMatrix(cd)
+
+        assert dtrain.feature_names == ['x', 'y']
+        assert dtrain.feature_types == ['int', 'float']
+
+        series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
+        assert isinstance(series, cudf.Series)
+        dtrain = xgb.DMatrix(series)
+
+        assert dtrain.feature_names == ['x']
+        assert dtrain.feature_types == ['int']
+
+        with pytest.raises(Exception):
+            dtrain = xgb.DMatrix(cd, label=cd)