Add support for cudf.Series (#4891)

This commit is contained in:
Vibhu Jawa 2019-09-25 20:52:28 -07:00 committed by Jiaming Yuan
parent 82ee2317e8
commit 2fa8b359e0
3 changed files with 23 additions and 12 deletions

View File

@ -132,9 +132,11 @@ except ImportError:
try: try:
from cudf import DataFrame as CUDF_DataFrame from cudf import DataFrame as CUDF_DataFrame
from cudf import Series as CUDF_Series
CUDF_INSTALLED = True CUDF_INSTALLED = True
except ImportError: except ImportError:
CUDF_DataFrame = object CUDF_DataFrame = object
CUDF_Series = object
CUDF_INSTALLED = False CUDF_INSTALLED = False
# sklearn # sklearn

View File

@ -19,7 +19,7 @@ import scipy.sparse
from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str, from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
PANDAS_INSTALLED, DataTable, PANDAS_INSTALLED, DataTable,
CUDF_INSTALLED, CUDF_DataFrame, CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
os_fspath, os_PathLike) os_fspath, os_PathLike)
from .libpath import find_lib_path from .libpath import find_lib_path
@ -243,26 +243,35 @@ def c_array(ctype, values):
def _use_columnar_initializer(data):
    '''Tell whether ``data`` should be routed through the columnar format
    initializer (data handed over as a json string).  Currently cudf is the
    only valid option: a cudf DataFrame or a cudf Series.'''
    # Without cudf the placeholder types are meaningless, so bail out first.
    if not CUDF_INSTALLED:
        return False
    columnar_types = (CUDF_DataFrame, CUDF_Series)
    return isinstance(data, columnar_types)
def _extract_interface_from_cudf_series(data):
    """Return the array interface of a cudf series as a dict.

    This function should be upstreamed to cudf.

    Parameters
    ----------
    data :
        Object exposing ``__cuda_array_interface__`` and ``has_null_mask``
        (a cudf Series in practice).

    Returns
    -------
    dict
        A shallow copy of ``data.__cuda_array_interface__``.  When
        ``data.has_null_mask`` is truthy, the ``'mask'`` entry is replaced
        by the mask object's own ``__cuda_array_interface__``.
    """
    # Work on a copy: rewriting 'mask' directly on the mapping handed out by
    # ``data`` would mutate the caller's object if that mapping is stored or
    # cached, and a second call would then fail because 'mask' would already
    # be a plain dict without a ``__cuda_array_interface__`` attribute.
    interface = dict(data.__cuda_array_interface__)
    if data.has_null_mask:
        interface['mask'] = interface['mask'].__cuda_array_interface__
    return interface
def _extract_interface_from_cudf(df, is_info): def _extract_interface_from_cudf(df, is_info):
'''This function should be upstreamed to cudf.''' """This function should be upstreamed to cudf."""
if not _use_columnar_initializer(df): if not _use_columnar_initializer(df):
raise ValueError('Only cudf is supported for initializing as json ' + raise ValueError('Only cudf is supported for initializing as json ' +
'columnar format. For other libraries please ' + 'columnar format. For other libraries please ' +
'refer to specific API.') 'refer to specific API.')
array_interfaces = [] array_interfaces = []
for col in df.columns: if isinstance(df, CUDF_DataFrame):
data = df[col] for col in df.columns:
interface = data.__cuda_array_interface__ array_interfaces.append(
if data.has_null_mask: _extract_interface_from_cudf_series(df[col]))
interface['mask'] = interface['mask'].__cuda_array_interface__ else:
array_interfaces.append(interface) array_interfaces.append(_extract_interface_from_cudf_series(df))
if is_info: if is_info:
array_interfaces = array_interfaces[0] array_interfaces = array_interfaces[0]

View File

@ -27,8 +27,8 @@ def dmatrix_from_cudf(input_type, missing=np.NAN):
np_label = np.random.randn(kRows).astype(input_type) np_label = np.random.randn(kRows).astype(input_type)
pa_label = pd.DataFrame(np_label) pa_label = pd.DataFrame(np_label)
cd: cudf.DataFrame = cudf.from_pandas(pa) cd = cudf.from_pandas(pa)
cd_label: cudf.DataFrame = cudf.from_pandas(pa_label) cd_label = cudf.from_pandas(pa_label).iloc[:, 0]
dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label) dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
assert dtrain.num_col() == kCols assert dtrain.num_col() == kCols