Fix dask prediction. (#4941)

* Fix dask prediction.

* Add better error messages for wrong partition.
This commit is contained in:
Jiaming Yuan
2019-10-14 23:19:34 -04:00
committed by GitHub
parent b61d534472
commit 2ebdec8aa6
5 changed files with 51 additions and 24 deletions

View File

@@ -139,13 +139,14 @@ class DaskDMatrix:
self._missing = missing
if len(data.shape) != 2:
_expect('2 dimensions input', data.shape)
raise ValueError(
'Expecting 2 dimensional input, got: {shape}'.format(
shape=data.shape))
if not any(isinstance(data, t) for t in (dd.DataFrame, da.Array)):
if not isinstance(data, (dd.DataFrame, da.Array)):
raise TypeError(_expect((dd.DataFrame, da.Array), type(data)))
if not any(
isinstance(label, t)
for t in (dd.DataFrame, da.Array, dd.Series, type(None))):
if not isinstance(label, (dd.DataFrame, da.Array, dd.Series,
type(None))):
raise TypeError(
_expect((dd.DataFrame, da.Array, dd.Series), type(label)))
@@ -158,6 +159,23 @@ class DaskDMatrix:
async def map_local_data(self, client, data, label=None, weights=None):
'''Obtain references to local data.'''
def inconsistent(left, left_name, right, right_name):
msg = 'Partitions between {a_name} and {b_name} are not ' \
'consistent: {a_len} != {b_len}. ' \
'Please try to repartition/rechunk your data.'.format(
a_name=left_name, b_name=right_name, a_len=len(left),
b_len=len(right)
)
return msg
def check_columns(parts):
# x is required to be 2 dim in __init__
assert parts.ndim == 1 or parts.shape[1], 'Data should be' \
' partitioned by row. To avoid this specify the number' \
' of columns for your dask Array explicitly. e.g.' \
' chunks=(partition_size, X.shape[1])'
data = data.persist()
if label is not None:
label = label.persist()
@@ -169,28 +187,28 @@ class DaskDMatrix:
# equivalents.
X_parts = data.to_delayed()
if isinstance(X_parts, numpy.ndarray):
assert X_parts.shape[1] == 1
check_columns(X_parts)
X_parts = X_parts.flatten().tolist()
if label is not None:
y_parts = label.to_delayed()
if isinstance(y_parts, numpy.ndarray):
assert y_parts.ndim == 1 or y_parts.shape[1] == 1
check_columns(y_parts)
y_parts = y_parts.flatten().tolist()
if weights is not None:
w_parts = weights.to_delayed()
if isinstance(w_parts, numpy.ndarray):
assert w_parts.ndim == 1 or w_parts.shape[1] == 1
check_columns(w_parts)
w_parts = w_parts.flatten().tolist()
parts = [X_parts]
if label is not None:
assert len(X_parts) == len(
y_parts), 'Partitions between X and y are not consistent'
y_parts), inconsistent(X_parts, 'X', y_parts, 'labels')
parts.append(y_parts)
if weights is not None:
assert len(X_parts) == len(
w_parts), 'Partitions between X and weight are not consistent.'
w_parts), inconsistent(X_parts, 'X', w_parts, 'weights')
parts.append(w_parts)
parts = list(map(delayed, zip(*parts)))
@@ -275,7 +293,11 @@ class DaskDMatrix:
cols = 0
for shape in shapes:
rows += shape[0]
cols += shape[1]
c = shape[1]
assert cols in (0, c), 'Shape between partitions are not the' \
' same. Got: {left} and {right}'.format(left=c, right=cols)
cols = c
return (rows, cols)