diff --git a/include/xgboost/data.h b/include/xgboost/data.h index cd31d01dc..44608e4a1 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -127,6 +127,10 @@ class MetaInfo { * \brief Set information in the meta info with array interface. * \param key The key of the information. * \param interface_str String representation of json format array interface. + * + * [ column_0, column_1, ... column_n ] + * + * Right now only 1 column is permitted. */ void SetInfo(const char* key, std::string const& interface_str); diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index e4a365ab9..bd124fa67 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -133,10 +133,12 @@ except ImportError: try: from cudf import DataFrame as CUDF_DataFrame from cudf import Series as CUDF_Series + from cudf import MultiIndex as CUDF_MultiIndex CUDF_INSTALLED = True except ImportError: CUDF_DataFrame = object CUDF_Series = object + CUDF_MultiIndex = object CUDF_INSTALLED = False # sklearn diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index a7ee5d43f..6a74e0733 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -17,10 +17,11 @@ import json import numpy as np import scipy.sparse -from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str, - PANDAS_INSTALLED, DataTable, - CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series, - os_fspath, os_PathLike) +from .compat import ( + STRING_TYPES, PY3, DataFrame, MultiIndex, py_str, + PANDAS_INSTALLED, DataTable, + CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series, CUDF_MultiIndex, + os_fspath, os_PathLike) from .libpath import find_lib_path @@ -236,14 +237,18 @@ def c_str(string): def c_array(ctype, values): """Convert a python string to c array.""" - if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype): + if (isinstance(values, np.ndarray) + and values.dtype.itemsize == 
ctypes.sizeof(ctype)): return (ctype * len(values)).from_buffer_copy(values) return (ctype * len(values))(*values) def _use_columnar_initializer(data): - '''Whether should we use columnar format initializer (pass data in as - json string). Currently cudf is the only valid option.''' + '''Whether we should use columnar format initializer (pass data in as json + string). Currently cudf is the only valid option. For other dataframe + types, use their specific API instead. + + ''' if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))): return True return False @@ -258,7 +263,7 @@ def _extract_interface_from_cudf_series(data): return interface -def _extract_interface_from_cudf(df, is_info): +def _extract_interface_from_cudf(df): """This function should be upstreamed to cudf.""" if not _use_columnar_initializer(df): raise ValueError('Only cudf is supported for initializing as json ' + @@ -273,9 +278,6 @@ def _extract_interface_from_cudf(df, is_info): else: array_interfaces.append(_extract_interface_from_cudf_series(df)) - if is_info: - array_interfaces = array_interfaces[0] - interfaces = bytes(json.dumps(array_interfaces, indent=2), 'utf-8') return interfaces @@ -337,6 +339,30 @@ def _maybe_pandas_label(label): return label +def _maybe_cudf_dataframe(data, feature_names, feature_types): + '''Extract internal data from cudf.DataFrame for DMatrix data.''' + if not (CUDF_INSTALLED and isinstance(data, + (CUDF_DataFrame, CUDF_Series))): + return data, feature_names, feature_types + if feature_names is None: + if isinstance(data, CUDF_Series): + feature_names = [data.name] + elif isinstance(data.columns, CUDF_MultiIndex): + feature_names = [ + ' '.join([str(x) for x in i]) + for i in data.columns + ] + else: + feature_names = data.columns.format() + if feature_types is None: + if isinstance(data, CUDF_Series): + dtypes = [data.dtype] + else: + dtypes = data.dtypes + feature_types = [PANDAS_DTYPE_MAPPER[d.name] for d in dtypes] + return data, feature_names, 
feature_types + + DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'} DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'} @@ -384,6 +410,21 @@ def _maybe_dt_array(array): return array +def _convert_dataframes(data, feature_names, feature_types): + data, feature_names, feature_types = _maybe_pandas_data(data, + feature_names, + feature_types) + + data, feature_names, feature_types = _maybe_dt_data(data, + feature_names, + feature_types) + + data, feature_names, feature_types = _maybe_cudf_dataframe( + data, feature_names, feature_types) + + return data, feature_names, feature_types + + class DMatrix(object): """Data Matrix used in XGBoost. @@ -404,8 +445,10 @@ class DMatrix(object): data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/ dt.Frame/cudf.DataFrame Data source of DMatrix. - When data is string or os.PathLike type, it represents the path libsvm format - txt file, or binary file that xgboost can read from. + When data is string or os.PathLike type, it represents the path + to a libsvm format txt file, csv file (by specifying uri parameter + 'path_to_csv?format=csv'), or binary file that xgboost can read + from. label : list, numpy 1-D array or cudf.DataFrame, optional Label of the training data. missing : float, optional @@ -445,13 +488,9 @@ class DMatrix(object): if isinstance(data, list): raise TypeError('Input data can not be a list.') - data, feature_names, feature_types = _maybe_pandas_data(data, - feature_names, - feature_types) - - data, feature_names, feature_types = _maybe_dt_data(data, - feature_names, - feature_types) + data, feature_names, feature_types = _convert_dataframes( + data, feature_names, feature_types + ) label = _maybe_pandas_label(label) label = _maybe_dt_array(label) @@ -604,7 +643,7 @@ class DMatrix(object): '''Initialize DMatrix from columnar memory format. 
''' - interfaces = _extract_interface_from_cudf(df, False) + interfaces = _extract_interface_from_cudf(df) handle = ctypes.c_void_p() has_missing = missing is not None missing = missing if has_missing else np.nan @@ -683,7 +722,7 @@ class DMatrix(object): def set_interface_info(self, field, data): '''Set info type peoperty into DMatrix.''' - interfaces = _extract_interface_from_cudf(data, True) + interfaces = _extract_interface_from_cudf(data) _check_call(_LIB.XGDMatrixSetInfoFromInterface(self.handle, c_str(field), interfaces)) diff --git a/src/data/data.cu b/src/data/data.cu index 11f968b6a..4c1750700 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -35,8 +35,10 @@ void CopyInfoImpl(std::map const& column, HostDeviceVector(j_arr); + Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); + auto const& j_arr = get(j_interface); + CHECK_EQ(j_arr.size(), 1) << "MetaInfo: " << c_key << ". " << ColumnarErrors::Dimension(1);; + auto const& j_arr_obj = get(j_arr[0]); std::string key {c_key}; ArrayInterfaceHandler::Validate(j_arr_obj); if (j_arr_obj.find("mask") != j_arr_obj.cend()) { diff --git a/tests/cpp/data/test_metainfo.cu b/tests/cpp/data/test_metainfo.cu index 849ac27c3..82e434e78 100644 --- a/tests/cpp/data/test_metainfo.cu +++ b/tests/cpp/data/test_metainfo.cu @@ -31,9 +31,10 @@ std::string PrepareData(std::string typestr, thrust::device_vector* out) { Json(Integer(reinterpret_cast(p_d_data))), Json(Boolean(false))}; column["data"] = j_data; + Json array(std::vector{column}); std::stringstream ss; - Json::Dump(column, &ss); + Json::Dump(array, &ss); std::string str = ss.str(); return str; diff --git a/tests/python-gpu/test_from_columnar.py b/tests/python-gpu/test_from_columnar.py index fe993533e..53fdfcc10 100644 --- a/tests/python-gpu/test_from_columnar.py +++ b/tests/python-gpu/test_from_columnar.py @@ -42,6 +42,7 @@ Arrow specification.''' @pytest.mark.skipif(**tm.no_cudf()) def test_from_cudf(self): '''Test constructing DMatrix 
from cudf''' + import cudf dmatrix_from_cudf(np.float32, np.NAN) dmatrix_from_cudf(np.float64, np.NAN) @@ -52,3 +53,19 @@ Arrow specification.''' dmatrix_from_cudf(np.int8, 2) dmatrix_from_cudf(np.int32, -2) dmatrix_from_cudf(np.int64, -3) + + cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]}) + dtrain = xgb.DMatrix(cd) + + assert dtrain.feature_names == ['x', 'y'] + assert dtrain.feature_types == ['int', 'float'] + + series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0] + assert isinstance(series, cudf.Series) + dtrain = xgb.DMatrix(series) + + assert dtrain.feature_names == ['x'] + assert dtrain.feature_types == ['int'] + + with pytest.raises(Exception): + dtrain = xgb.DMatrix(cd, label=cd)