Support feature names/types for cudf. (#4902)

* Implement most of the pandas procedure for cudf except for type conversion.
* Requires an array of interfaces in metainfo.
This commit is contained in:
Jiaming Yuan 2019-09-29 15:07:51 -04:00 committed by GitHub
parent 2fa8b359e0
commit d30e63a0a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 90 additions and 25 deletions

View File

@ -127,6 +127,10 @@ class MetaInfo {
* \brief Set information in the meta info with array interface.
* \param key The key of the information.
* \param interface_str String representation of json format array interface.
*
* [ column_0, column_1, ... column_n ]
*
* Right now only 1 column is permitted.
*/
void SetInfo(const char* key, std::string const& interface_str);

View File

@ -133,10 +133,12 @@ except ImportError:
try:
from cudf import DataFrame as CUDF_DataFrame
from cudf import Series as CUDF_Series
from cudf import MultiIndex as CUDF_MultiIndex
CUDF_INSTALLED = True
except ImportError:
CUDF_DataFrame = object
CUDF_Series = object
CUDF_MultiIndex = object
CUDF_INSTALLED = False
# sklearn

View File

@ -17,9 +17,10 @@ import json
import numpy as np
import scipy.sparse
from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
from .compat import (
STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
PANDAS_INSTALLED, DataTable,
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series, CUDF_MultiIndex,
os_fspath, os_PathLike)
from .libpath import find_lib_path
@ -236,14 +237,18 @@ def c_str(string):
def c_array(ctype, values):
"""Convert a python string to c array."""
if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
if (isinstance(values, np.ndarray)
and values.dtype.itemsize == ctypes.sizeof(ctype)):
return (ctype * len(values)).from_buffer_copy(values)
return (ctype * len(values))(*values)
def _use_columnar_initializer(data):
'''Whether should we use columnar format initializer (pass data in as
json string). Currently cudf is the only valid option.'''
'''Whether should we use columnar format initializer (pass data in as json
string). Currently cudf is the only valid option. For other dataframe
types, use their specific API instead.
'''
if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
return True
return False
@ -258,7 +263,7 @@ def _extract_interface_from_cudf_series(data):
return interface
def _extract_interface_from_cudf(df, is_info):
def _extract_interface_from_cudf(df):
"""This function should be upstreamed to cudf."""
if not _use_columnar_initializer(df):
raise ValueError('Only cudf is supported for initializing as json ' +
@ -273,9 +278,6 @@ def _extract_interface_from_cudf(df, is_info):
else:
array_interfaces.append(_extract_interface_from_cudf_series(df))
if is_info:
array_interfaces = array_interfaces[0]
interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8')
return interfaces
@ -337,6 +339,30 @@ def _maybe_pandas_label(label):
return label
def _maybe_cudf_dataframe(data, feature_names, feature_types):
    '''Fill in feature names/types from a cudf input for DMatrix data.

    ``data`` itself is always returned untouched.  When cudf is not
    installed, or ``data`` is neither a cudf DataFrame nor a cudf Series,
    all three arguments are handed back as they were received.

    Parameters
    ----------
    data :
        Candidate input data.
    feature_names :
        Names supplied by the caller, or None to derive them from ``data``.
    feature_types :
        Types supplied by the caller, or None to derive them from ``data``.

    Returns
    -------
    tuple of (data, feature_names, feature_types)
    '''
    if not CUDF_INSTALLED or not isinstance(
            data, (CUDF_DataFrame, CUDF_Series)):
        return data, feature_names, feature_types

    single_column = isinstance(data, CUDF_Series)

    if feature_names is None:
        if single_column:
            # A series contributes exactly one feature, named after itself.
            feature_names = [data.name]
        else:
            columns = data.columns
            if isinstance(columns, CUDF_MultiIndex):
                # Collapse every multi-level entry into one space separated
                # string.
                feature_names = [
                    ' '.join(map(str, entry)) for entry in columns
                ]
            else:
                feature_names = columns.format()

    if feature_types is None:
        column_dtypes = [data.dtype] if single_column else data.dtypes
        feature_types = []
        for dtype in column_dtypes:
            # The lookup fails for dtype names absent from
            # PANDAS_DTYPE_MAPPER.
            feature_types.append(PANDAS_DTYPE_MAPPER[dtype.name])

    return data, feature_names, feature_types
DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'}
DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
@ -384,6 +410,21 @@ def _maybe_dt_array(array):
return array
def _convert_dataframes(data, feature_names, feature_types):
    '''Pass the input through every dataframe specific converter in turn.

    The pandas converter runs first, then the datatable one, then the cudf
    one.  Each receives the ``(data, feature_names, feature_types)`` triple
    returned by the previous step; the final triple is returned.
    '''
    converters = (
        _maybe_pandas_data,
        _maybe_dt_data,
        _maybe_cudf_dataframe,
    )
    converted = (data, feature_names, feature_types)
    for convert in converters:
        converted = convert(*converted)
    data, feature_names, feature_types = converted
    return data, feature_names, feature_types
class DMatrix(object):
"""Data Matrix used in XGBoost.
@ -404,8 +445,10 @@ class DMatrix(object):
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
dt.Frame/cudf.DataFrame
Data source of DMatrix.
When data is string or os.PathLike type, it represents the path libsvm format
txt file, or binary file that xgboost can read from.
When data is string or os.PathLike type, it represents the path to a
libsvm format txt file, a csv file (by specifying uri parameter
'path_to_csv?format=csv'), or a binary file that xgboost can read
from.
label : list, numpy 1-D array or cudf.DataFrame, optional
Label of the training data.
missing : float, optional
@ -445,13 +488,9 @@ class DMatrix(object):
if isinstance(data, list):
raise TypeError('Input data can not be a list.')
data, feature_names, feature_types = _maybe_pandas_data(data,
feature_names,
feature_types)
data, feature_names, feature_types = _maybe_dt_data(data,
feature_names,
feature_types)
data, feature_names, feature_types = _convert_dataframes(
data, feature_names, feature_types
)
label = _maybe_pandas_label(label)
label = _maybe_dt_array(label)
@ -604,7 +643,7 @@ class DMatrix(object):
'''Initialize DMatrix from columnar memory format.
'''
interfaces = _extract_interface_from_cudf(df, False)
interfaces = _extract_interface_from_cudf(df)
handle = ctypes.c_void_p()
has_missing = missing is not None
missing = missing if has_missing else np.nan
@ -683,7 +722,7 @@ class DMatrix(object):
def set_interface_info(self, field, data):
'''Set info type property into DMatrix.'''
interfaces = _extract_interface_from_cudf(data, True)
interfaces = _extract_interface_from_cudf(data)
_check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle,
c_str(field),
interfaces))

View File

@ -35,8 +35,10 @@ void CopyInfoImpl(std::map<std::string, Json> const& column, HostDeviceVector<fl
}
void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
Json j_arr = Json::Load({interface_str.c_str(), interface_str.size()});
auto const& j_arr_obj = get<Object>(j_arr);
Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
auto const& j_arr = get<Array>(j_interface);
CHECK_EQ(j_arr.size(), 1) << "MetaInfo: " << c_key << ". " << ColumnarErrors::Dimension(1);;
auto const& j_arr_obj = get<Object const>(j_arr[0]);
std::string key {c_key};
ArrayInterfaceHandler::Validate(j_arr_obj);
if (j_arr_obj.find("mask") != j_arr_obj.cend()) {

View File

@ -31,9 +31,10 @@ std::string PrepareData(std::string typestr, thrust::device_vector<T>* out) {
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
Json(Boolean(false))};
column["data"] = j_data;
Json array(std::vector<Json>{column});
std::stringstream ss;
Json::Dump(column, &ss);
Json::Dump(array, &ss);
std::string str = ss.str();
return str;

View File

@ -42,6 +42,7 @@ Arrow specification.'''
@pytest.mark.skipif(**tm.no_cudf())
def test_from_cudf(self):
'''Test constructing DMatrix from cudf'''
import cudf
dmatrix_from_cudf(np.float32, np.NAN)
dmatrix_from_cudf(np.float64, np.NAN)
@ -52,3 +53,19 @@ Arrow specification.'''
dmatrix_from_cudf(np.int8, 2)
dmatrix_from_cudf(np.int32, -2)
dmatrix_from_cudf(np.int64, -3)
cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
dtrain = xgb.DMatrix(cd)
assert dtrain.feature_names == ['x', 'y']
assert dtrain.feature_types == ['int', 'float']
series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
assert isinstance(series, cudf.Series)
dtrain = xgb.DMatrix(series)
assert dtrain.feature_names == ['x']
assert dtrain.feature_types == ['int']
with pytest.raises(Exception):
dtrain = xgb.DMatrix(cd, label=cd)