Support feature names/types for cudf. (#4902)
* Implement most of the pandas procedure for cudf except for type conversion. * Requires an array of interfaces in metainfo.
This commit is contained in:
parent
2fa8b359e0
commit
d30e63a0a5
@ -127,6 +127,10 @@ class MetaInfo {
|
|||||||
* \brief Set information in the meta info with array interface.
|
* \brief Set information in the meta info with array interface.
|
||||||
* \param key The key of the information.
|
* \param key The key of the information.
|
||||||
* \param interface_str String representation of json format array interface.
|
* \param interface_str String representation of json format array interface.
|
||||||
|
*
|
||||||
|
* [ column_0, column_1, ... column_n ]
|
||||||
|
*
|
||||||
|
* Right now only 1 column is permitted.
|
||||||
*/
|
*/
|
||||||
void SetInfo(const char* key, std::string const& interface_str);
|
void SetInfo(const char* key, std::string const& interface_str);
|
||||||
|
|
||||||
|
|||||||
@ -133,10 +133,12 @@ except ImportError:
|
|||||||
try:
|
try:
|
||||||
from cudf import DataFrame as CUDF_DataFrame
|
from cudf import DataFrame as CUDF_DataFrame
|
||||||
from cudf import Series as CUDF_Series
|
from cudf import Series as CUDF_Series
|
||||||
|
from cudf import MultiIndex as CUDF_MultiIndex
|
||||||
CUDF_INSTALLED = True
|
CUDF_INSTALLED = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
CUDF_DataFrame = object
|
CUDF_DataFrame = object
|
||||||
CUDF_Series = object
|
CUDF_Series = object
|
||||||
|
CUDF_MultiIndex = object
|
||||||
CUDF_INSTALLED = False
|
CUDF_INSTALLED = False
|
||||||
|
|
||||||
# sklearn
|
# sklearn
|
||||||
|
|||||||
@ -17,10 +17,11 @@ import json
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import scipy.sparse
|
import scipy.sparse
|
||||||
|
|
||||||
from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
|
from .compat import (
|
||||||
PANDAS_INSTALLED, DataTable,
|
STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
|
||||||
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
|
PANDAS_INSTALLED, DataTable,
|
||||||
os_fspath, os_PathLike)
|
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series, CUDF_MultiIndex,
|
||||||
|
os_fspath, os_PathLike)
|
||||||
from .libpath import find_lib_path
|
from .libpath import find_lib_path
|
||||||
|
|
||||||
|
|
||||||
@ -236,14 +237,18 @@ def c_str(string):
|
|||||||
|
|
||||||
def c_array(ctype, values):
|
def c_array(ctype, values):
|
||||||
"""Convert a python string to c array."""
|
"""Convert a python string to c array."""
|
||||||
if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
|
if (isinstance(values, np.ndarray)
|
||||||
|
and values.dtype.itemsize == ctypes.sizeof(ctype)):
|
||||||
return (ctype * len(values)).from_buffer_copy(values)
|
return (ctype * len(values)).from_buffer_copy(values)
|
||||||
return (ctype * len(values))(*values)
|
return (ctype * len(values))(*values)
|
||||||
|
|
||||||
|
|
||||||
def _use_columnar_initializer(data):
|
def _use_columnar_initializer(data):
|
||||||
'''Whether we should use the columnar format initializer (pass data in as
|
'''Whether we should use the columnar format initializer (pass data in as json
|
||||||
json string). Currently cudf is the only valid option.'''
|
string). Currently cudf is the only valid option. For other dataframe
|
||||||
|
types, use their specific API instead.
|
||||||
|
|
||||||
|
'''
|
||||||
if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
|
if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
@ -258,7 +263,7 @@ def _extract_interface_from_cudf_series(data):
|
|||||||
return interface
|
return interface
|
||||||
|
|
||||||
|
|
||||||
def _extract_interface_from_cudf(df, is_info):
|
def _extract_interface_from_cudf(df):
|
||||||
"""This function should be upstreamed to cudf."""
|
"""This function should be upstreamed to cudf."""
|
||||||
if not _use_columnar_initializer(df):
|
if not _use_columnar_initializer(df):
|
||||||
raise ValueError('Only cudf is supported for initializing as json ' +
|
raise ValueError('Only cudf is supported for initializing as json ' +
|
||||||
@ -273,9 +278,6 @@ def _extract_interface_from_cudf(df, is_info):
|
|||||||
else:
|
else:
|
||||||
array_interfaces.append(_extract_interface_from_cudf_series(df))
|
array_interfaces.append(_extract_interface_from_cudf_series(df))
|
||||||
|
|
||||||
if is_info:
|
|
||||||
array_interfaces = array_interfaces[0]
|
|
||||||
|
|
||||||
interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8')
|
interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8')
|
||||||
return interfaces
|
return interfaces
|
||||||
|
|
||||||
@ -337,6 +339,30 @@ def _maybe_pandas_label(label):
|
|||||||
return label
|
return label
|
||||||
|
|
||||||
|
|
||||||
|
def _maybe_cudf_dataframe(data, feature_names, feature_types):
|
||||||
|
'''Extract internal data from cudf.DataFrame for DMatrix data.'''
|
||||||
|
if not (CUDF_INSTALLED and isinstance(data,
|
||||||
|
(CUDF_DataFrame, CUDF_Series))):
|
||||||
|
return data, feature_names, feature_types
|
||||||
|
if feature_names is None:
|
||||||
|
if isinstance(data, CUDF_Series):
|
||||||
|
feature_names = [data.name]
|
||||||
|
elif isinstance(data.columns, CUDF_MultiIndex):
|
||||||
|
feature_names = [
|
||||||
|
' '.join([str(x) for x in i])
|
||||||
|
for i in data.columns
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
feature_names = data.columns.format()
|
||||||
|
if feature_types is None:
|
||||||
|
if isinstance(data, CUDF_Series):
|
||||||
|
dtypes = [data.dtype]
|
||||||
|
else:
|
||||||
|
dtypes = data.dtypes
|
||||||
|
feature_types = [PANDAS_DTYPE_MAPPER[d.name] for d in dtypes]
|
||||||
|
return data, feature_names, feature_types
|
||||||
|
|
||||||
|
|
||||||
DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'}
|
DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'}
|
||||||
|
|
||||||
DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
|
DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
|
||||||
@ -384,6 +410,21 @@ def _maybe_dt_array(array):
|
|||||||
return array
|
return array
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_dataframes(data, feature_names, feature_types):
|
||||||
|
data, feature_names, feature_types = _maybe_pandas_data(data,
|
||||||
|
feature_names,
|
||||||
|
feature_types)
|
||||||
|
|
||||||
|
data, feature_names, feature_types = _maybe_dt_data(data,
|
||||||
|
feature_names,
|
||||||
|
feature_types)
|
||||||
|
|
||||||
|
data, feature_names, feature_types = _maybe_cudf_dataframe(
|
||||||
|
data, feature_names, feature_types)
|
||||||
|
|
||||||
|
return data, feature_names, feature_types
|
||||||
|
|
||||||
|
|
||||||
class DMatrix(object):
|
class DMatrix(object):
|
||||||
"""Data Matrix used in XGBoost.
|
"""Data Matrix used in XGBoost.
|
||||||
|
|
||||||
@ -404,8 +445,10 @@ class DMatrix(object):
|
|||||||
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
|
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
|
||||||
dt.Frame/cudf.DataFrame
|
dt.Frame/cudf.DataFrame
|
||||||
Data source of DMatrix.
|
Data source of DMatrix.
|
||||||
When data is string or os.PathLike type, it represents the path to a libsvm format
|
When data is string or os.PathLike type, it represents the path to a
|
||||||
txt file, or binary file that xgboost can read from.
|
libsvm format txt file, csv file (by specifying uri parameter
|
||||||
|
'path_to_csv?format=csv'), or binary file that xgboost can read
|
||||||
|
from.
|
||||||
label : list, numpy 1-D array or cudf.DataFrame, optional
|
label : list, numpy 1-D array or cudf.DataFrame, optional
|
||||||
Label of the training data.
|
Label of the training data.
|
||||||
missing : float, optional
|
missing : float, optional
|
||||||
@ -445,13 +488,9 @@ class DMatrix(object):
|
|||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
raise TypeError('Input data can not be a list.')
|
raise TypeError('Input data can not be a list.')
|
||||||
|
|
||||||
data, feature_names, feature_types = _maybe_pandas_data(data,
|
data, feature_names, feature_types = _convert_dataframes(
|
||||||
feature_names,
|
data, feature_names, feature_types
|
||||||
feature_types)
|
)
|
||||||
|
|
||||||
data, feature_names, feature_types = _maybe_dt_data(data,
|
|
||||||
feature_names,
|
|
||||||
feature_types)
|
|
||||||
|
|
||||||
label = _maybe_pandas_label(label)
|
label = _maybe_pandas_label(label)
|
||||||
label = _maybe_dt_array(label)
|
label = _maybe_dt_array(label)
|
||||||
@ -604,7 +643,7 @@ class DMatrix(object):
|
|||||||
'''Initialize DMatrix from columnar memory format.
|
'''Initialize DMatrix from columnar memory format.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
interfaces = _extract_interface_from_cudf(df, False)
|
interfaces = _extract_interface_from_cudf(df)
|
||||||
handle = ctypes.c_void_p()
|
handle = ctypes.c_void_p()
|
||||||
has_missing = missing is not None
|
has_missing = missing is not None
|
||||||
missing = missing if has_missing else np.nan
|
missing = missing if has_missing else np.nan
|
||||||
@ -683,7 +722,7 @@ class DMatrix(object):
|
|||||||
|
|
||||||
def set_interface_info(self, field, data):
|
def set_interface_info(self, field, data):
|
||||||
'''Set info type property into DMatrix.'''
|
'''Set info type property into DMatrix.'''
|
||||||
interfaces = _extract_interface_from_cudf(data, True)
|
interfaces = _extract_interface_from_cudf(data)
|
||||||
_check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle,
|
_check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle,
|
||||||
c_str(field),
|
c_str(field),
|
||||||
interfaces))
|
interfaces))
|
||||||
|
|||||||
@ -35,8 +35,10 @@ void CopyInfoImpl(std::map<std::string, Json> const& column, HostDeviceVector<fl
|
|||||||
}
|
}
|
||||||
|
|
||||||
void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
|
void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
|
||||||
Json j_arr = Json::Load({interface_str.c_str(), interface_str.size()});
|
Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
|
||||||
auto const& j_arr_obj = get<Object>(j_arr);
|
auto const& j_arr = get<Array>(j_interface);
|
||||||
|
CHECK_EQ(j_arr.size(), 1) << "MetaInfo: " << c_key << ". " << ColumnarErrors::Dimension(1);;
|
||||||
|
auto const& j_arr_obj = get<Object const>(j_arr[0]);
|
||||||
std::string key {c_key};
|
std::string key {c_key};
|
||||||
ArrayInterfaceHandler::Validate(j_arr_obj);
|
ArrayInterfaceHandler::Validate(j_arr_obj);
|
||||||
if (j_arr_obj.find("mask") != j_arr_obj.cend()) {
|
if (j_arr_obj.find("mask") != j_arr_obj.cend()) {
|
||||||
|
|||||||
@ -31,9 +31,10 @@ std::string PrepareData(std::string typestr, thrust::device_vector<T>* out) {
|
|||||||
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
||||||
Json(Boolean(false))};
|
Json(Boolean(false))};
|
||||||
column["data"] = j_data;
|
column["data"] = j_data;
|
||||||
|
Json array(std::vector<Json>{column});
|
||||||
|
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
Json::Dump(column, &ss);
|
Json::Dump(array, &ss);
|
||||||
std::string str = ss.str();
|
std::string str = ss.str();
|
||||||
|
|
||||||
return str;
|
return str;
|
||||||
|
|||||||
@ -42,6 +42,7 @@ Arrow specification.'''
|
|||||||
@pytest.mark.skipif(**tm.no_cudf())
|
@pytest.mark.skipif(**tm.no_cudf())
|
||||||
def test_from_cudf(self):
|
def test_from_cudf(self):
|
||||||
'''Test constructing DMatrix from cudf'''
|
'''Test constructing DMatrix from cudf'''
|
||||||
|
import cudf
|
||||||
dmatrix_from_cudf(np.float32, np.NAN)
|
dmatrix_from_cudf(np.float32, np.NAN)
|
||||||
dmatrix_from_cudf(np.float64, np.NAN)
|
dmatrix_from_cudf(np.float64, np.NAN)
|
||||||
|
|
||||||
@ -52,3 +53,19 @@ Arrow specification.'''
|
|||||||
dmatrix_from_cudf(np.int8, 2)
|
dmatrix_from_cudf(np.int8, 2)
|
||||||
dmatrix_from_cudf(np.int32, -2)
|
dmatrix_from_cudf(np.int32, -2)
|
||||||
dmatrix_from_cudf(np.int64, -3)
|
dmatrix_from_cudf(np.int64, -3)
|
||||||
|
|
||||||
|
cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
|
||||||
|
dtrain = xgb.DMatrix(cd)
|
||||||
|
|
||||||
|
assert dtrain.feature_names == ['x', 'y']
|
||||||
|
assert dtrain.feature_types == ['int', 'float']
|
||||||
|
|
||||||
|
series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
|
||||||
|
assert isinstance(series, cudf.Series)
|
||||||
|
dtrain = xgb.DMatrix(series)
|
||||||
|
|
||||||
|
assert dtrain.feature_names == ['x']
|
||||||
|
assert dtrain.feature_types == ['int']
|
||||||
|
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
dtrain = xgb.DMatrix(cd, label=cd)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user