Support feature names/types for cudf. (#4902)
* Implement most of the pandas procedure for cudf except for type conversion. * Requires an array of interfaces in metainfo.
This commit is contained in:
@@ -133,10 +133,12 @@ except ImportError:
|
||||
try:
|
||||
from cudf import DataFrame as CUDF_DataFrame
|
||||
from cudf import Series as CUDF_Series
|
||||
from cudf import MultiIndex as CUDF_MultiIndex
|
||||
CUDF_INSTALLED = True
|
||||
except ImportError:
|
||||
CUDF_DataFrame = object
|
||||
CUDF_Series = object
|
||||
CUDF_MultiIndex = object
|
||||
CUDF_INSTALLED = False
|
||||
|
||||
# sklearn
|
||||
|
||||
@@ -17,10 +17,11 @@ import json
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
|
||||
from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
|
||||
PANDAS_INSTALLED, DataTable,
|
||||
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
|
||||
os_fspath, os_PathLike)
|
||||
from .compat import (
|
||||
STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
|
||||
PANDAS_INSTALLED, DataTable,
|
||||
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series, CUDF_MultiIndex,
|
||||
os_fspath, os_PathLike)
|
||||
from .libpath import find_lib_path
|
||||
|
||||
|
||||
@@ -236,14 +237,18 @@ def c_str(string):
|
||||
|
||||
def c_array(ctype, values):
|
||||
"""Convert a python string to c array."""
|
||||
if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
|
||||
if (isinstance(values, np.ndarray)
|
||||
and values.dtype.itemsize == ctypes.sizeof(ctype)):
|
||||
return (ctype * len(values)).from_buffer_copy(values)
|
||||
return (ctype * len(values))(*values)
|
||||
|
||||
|
||||
def _use_columnar_initializer(data):
|
||||
'''Whether should we use columnar format initializer (pass data in as
|
||||
json string). Currently cudf is the only valid option.'''
|
||||
'''Whether should we use columnar format initializer (pass data in as json
|
||||
string). Currently cudf is the only valid option. For other dataframe
|
||||
types, use their sepcific API instead.
|
||||
|
||||
'''
|
||||
if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
|
||||
return True
|
||||
return False
|
||||
@@ -258,7 +263,7 @@ def _extract_interface_from_cudf_series(data):
|
||||
return interface
|
||||
|
||||
|
||||
def _extract_interface_from_cudf(df, is_info):
|
||||
def _extract_interface_from_cudf(df):
|
||||
"""This function should be upstreamed to cudf."""
|
||||
if not _use_columnar_initializer(df):
|
||||
raise ValueError('Only cudf is supported for initializing as json ' +
|
||||
@@ -273,9 +278,6 @@ def _extract_interface_from_cudf(df, is_info):
|
||||
else:
|
||||
array_interfaces.append(_extract_interface_from_cudf_series(df))
|
||||
|
||||
if is_info:
|
||||
array_interfaces = array_interfaces[0]
|
||||
|
||||
interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8')
|
||||
return interfaces
|
||||
|
||||
@@ -337,6 +339,30 @@ def _maybe_pandas_label(label):
|
||||
return label
|
||||
|
||||
|
||||
def _maybe_cudf_dataframe(data, feature_names, feature_types):
|
||||
'''Extract internal data from cudf.DataFrame for DMatrix data.'''
|
||||
if not (CUDF_INSTALLED and isinstance(data,
|
||||
(CUDF_DataFrame, CUDF_Series))):
|
||||
return data, feature_names, feature_types
|
||||
if feature_names is None:
|
||||
if isinstance(data, CUDF_Series):
|
||||
feature_names = [data.name]
|
||||
elif isinstance(data.columns, CUDF_MultiIndex):
|
||||
feature_names = [
|
||||
' '.join([str(x) for x in i])
|
||||
for i in data.columns
|
||||
]
|
||||
else:
|
||||
feature_names = data.columns.format()
|
||||
if feature_types is None:
|
||||
if isinstance(data, CUDF_Series):
|
||||
dtypes = [data.dtype]
|
||||
else:
|
||||
dtypes = data.dtypes
|
||||
feature_types = [PANDAS_DTYPE_MAPPER[d.name] for d in dtypes]
|
||||
return data, feature_names, feature_types
|
||||
|
||||
|
||||
DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'}
|
||||
|
||||
DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
|
||||
@@ -384,6 +410,21 @@ def _maybe_dt_array(array):
|
||||
return array
|
||||
|
||||
|
||||
def _convert_dataframes(data, feature_names, feature_types):
|
||||
data, feature_names, feature_types = _maybe_pandas_data(data,
|
||||
feature_names,
|
||||
feature_types)
|
||||
|
||||
data, feature_names, feature_types = _maybe_dt_data(data,
|
||||
feature_names,
|
||||
feature_types)
|
||||
|
||||
data, feature_names, feature_types = _maybe_cudf_dataframe(
|
||||
data, feature_names, feature_types)
|
||||
|
||||
return data, feature_names, feature_types
|
||||
|
||||
|
||||
class DMatrix(object):
|
||||
"""Data Matrix used in XGBoost.
|
||||
|
||||
@@ -404,8 +445,10 @@ class DMatrix(object):
|
||||
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
|
||||
dt.Frame/cudf.DataFrame
|
||||
Data source of DMatrix.
|
||||
When data is string or os.PathLike type, it represents the path libsvm format
|
||||
txt file, or binary file that xgboost can read from.
|
||||
When data is string or os.PathLike type, it represents the path
|
||||
libsvm format txt file, csv file (by specifying uri parameter
|
||||
'path_to_csv?format=csv'), or binary file that xgboost can read
|
||||
from.
|
||||
label : list, numpy 1-D array or cudf.DataFrame, optional
|
||||
Label of the training data.
|
||||
missing : float, optional
|
||||
@@ -445,13 +488,9 @@ class DMatrix(object):
|
||||
if isinstance(data, list):
|
||||
raise TypeError('Input data can not be a list.')
|
||||
|
||||
data, feature_names, feature_types = _maybe_pandas_data(data,
|
||||
feature_names,
|
||||
feature_types)
|
||||
|
||||
data, feature_names, feature_types = _maybe_dt_data(data,
|
||||
feature_names,
|
||||
feature_types)
|
||||
data, feature_names, feature_types = _convert_dataframes(
|
||||
data, feature_names, feature_types
|
||||
)
|
||||
|
||||
label = _maybe_pandas_label(label)
|
||||
label = _maybe_dt_array(label)
|
||||
@@ -604,7 +643,7 @@ class DMatrix(object):
|
||||
'''Initialize DMatrix from columnar memory format.
|
||||
|
||||
'''
|
||||
interfaces = _extract_interface_from_cudf(df, False)
|
||||
interfaces = _extract_interface_from_cudf(df)
|
||||
handle = ctypes.c_void_p()
|
||||
has_missing = missing is not None
|
||||
missing = missing if has_missing else np.nan
|
||||
@@ -683,7 +722,7 @@ class DMatrix(object):
|
||||
|
||||
def set_interface_info(self, field, data):
|
||||
'''Set info type peoperty into DMatrix.'''
|
||||
interfaces = _extract_interface_from_cudf(data, True)
|
||||
interfaces = _extract_interface_from_cudf(data)
|
||||
_check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle,
|
||||
c_str(field),
|
||||
interfaces))
|
||||
|
||||
Reference in New Issue
Block a user