Add support for cudf.Series (#4891)

This commit is contained in:
Vibhu Jawa 2019-09-25 20:52:28 -07:00 committed by Jiaming Yuan
parent 82ee2317e8
commit 2fa8b359e0
3 changed files with 23 additions and 12 deletions

View File

@ -132,9 +132,11 @@ except ImportError:
try:
from cudf import DataFrame as CUDF_DataFrame
from cudf import Series as CUDF_Series
CUDF_INSTALLED = True
except ImportError:
CUDF_DataFrame = object
CUDF_Series = object
CUDF_INSTALLED = False
# sklearn

View File

@ -19,7 +19,7 @@ import scipy.sparse
from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
PANDAS_INSTALLED, DataTable,
CUDF_INSTALLED, CUDF_DataFrame,
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
os_fspath, os_PathLike)
from .libpath import find_lib_path
@ -243,26 +243,35 @@ def c_array(ctype, values):
def _use_columnar_initializer(data):
    '''Whether should we use columnar format initializer (pass data in as
    json string).  Currently cudf is the only valid option.

    Returns True only when cudf is installed and ``data`` is a cudf
    DataFrame or a cudf Series; False for every other input.
    '''
    # When cudf is missing, CUDF_DataFrame / CUDF_Series are aliased to
    # ``object`` and would match anything, so the CUDF_INSTALLED guard must
    # be evaluated first.
    return CUDF_INSTALLED and isinstance(data, (CUDF_DataFrame, CUDF_Series))
def _extract_interface_from_cudf_series(data):
    """Return the CUDA array interface of a single cudf series.

    This function should be upstreamed to cudf.

    Parameters
    ----------
    data :
        Object exposing ``__cuda_array_interface__`` and ``has_null_mask``.

    Returns
    -------
    dict
        The entries of ``data.__cuda_array_interface__``.  When
        ``data.has_null_mask`` is true, the ``'mask'`` entry is replaced by
        the mask object's own ``__cuda_array_interface__``.
    """
    # Work on a shallow copy: replacing 'mask' on the dict handed out by
    # `data` would break a second call if that dict is reused, because the
    # mask would already be a plain dict without __cuda_array_interface__.
    interface = dict(data.__cuda_array_interface__)
    if data.has_null_mask:
        interface['mask'] = interface['mask'].__cuda_array_interface__
    return interface
def _extract_interface_from_cudf(df, is_info):
'''This function should be upstreamed to cudf.'''
"""This function should be upstreamed to cudf."""
if not _use_columnar_initializer(df):
raise ValueError('Only cudf is supported for initializing as json ' +
'columnar format. For other libraries please ' +
'refer to specific API.')
array_interfaces = []
for col in df.columns:
data = df[col]
interface = data.__cuda_array_interface__
if data.has_null_mask:
interface['mask'] = interface['mask'].__cuda_array_interface__
array_interfaces.append(interface)
if isinstance(df, CUDF_DataFrame):
for col in df.columns:
array_interfaces.append(
_extract_interface_from_cudf_series(df[col]))
else:
array_interfaces.append(_extract_interface_from_cudf_series(df))
if is_info:
array_interfaces = array_interfaces[0]

View File

@ -27,8 +27,8 @@ def dmatrix_from_cudf(input_type, missing=np.NAN):
np_label = np.random.randn(kRows).astype(input_type)
pa_label = pd.DataFrame(np_label)
cd: cudf.DataFrame = cudf.from_pandas(pa)
cd_label: cudf.DataFrame = cudf.from_pandas(pa_label)
cd = cudf.from_pandas(pa)
cd_label = cudf.from_pandas(pa_label).iloc[:, 0]
dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
assert dtrain.num_col() == kCols