Modin DF support (#6055)

* Modin DF support

* mode change

* tests were added, ci env was extended

* mode change

* Remove redundant installation of modin

* Add a pytest skip marker for modin

* Install Modin[ray] from PyPI

* fix interfering

* avoid extra conversion

* delete cv test for modin

* revert cv function

Co-authored-by: ShvetsKS <kirill.shvets@intel.com>
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
ShvetsKS
2020-08-29 22:33:30 +03:00
committed by GitHub
parent 3a990433f9
commit c1ca872d1e
5 changed files with 187 additions and 2 deletions

View File

@@ -151,6 +151,13 @@ def _is_pandas_df(data):
return False
return isinstance(data, pd.DataFrame)
def _is_modin_df(data):
try:
import modin.pandas as pd
except ImportError:
return False
return isinstance(data, pd.DataFrame)
_pandas_dtype_mapper = {
'int8': 'int',
@@ -208,8 +215,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
'DataFrame for {meta} cannot have multiple columns'.format(
meta=meta))
dtype = meta_type if meta_type else 'float'
data = data.values.astype(dtype)
dtype = meta_type if meta_type else np.float32
data = np.ascontiguousarray(data.values, dtype=dtype)
return data, feature_names, feature_types
@@ -228,6 +235,13 @@ def _is_pandas_series(data):
return False
return isinstance(data, pd.Series)
def _is_modin_series(data):
try:
import modin.pandas as pd
except ImportError:
return False
return isinstance(data, pd.Series)
def _from_pandas_series(data, missing, nthread, feature_types, feature_names):
return _from_numpy_array(data.values.astype('float'), missing, nthread,
@@ -525,6 +539,12 @@ def dispatch_data_backend(data, missing, threads,
_warn_unused_missing(data, missing)
return _from_dt_df(data, missing, threads, feature_names,
feature_types)
if _is_modin_df(data):
return _from_pandas_df(data, missing, threads,
feature_names, feature_types)
if _is_modin_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
feature_types)
if _has_array_protocol(data):
pass
raise TypeError('Not supported type for data.' + str(type(data)))
@@ -648,6 +668,15 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
if _is_dt_df(data):
_meta_from_dt(data, name, dtype, handle)
return
if _is_modin_df(data):
data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_modin_series(data):
data = data.values.astype('float')
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle)
return
if _has_array_protocol(data):
pass
raise TypeError('Unsupported type for ' + name, str(type(data)))