Implement Python data handler. (#5689)
* Define data handlers for DMatrix.
* Throw ValueError in scikit-learn interface.
This commit is contained in:
@@ -22,6 +22,16 @@ def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
|
||||
dtrain = DMatrixT(X, missing=missing, label=y)
|
||||
assert dtrain.num_col() == kCols
|
||||
assert dtrain.num_row() == kRows
|
||||
|
||||
if DMatrixT is xgb.DeviceQuantileDMatrix:
|
||||
# Slice is not supported by DeviceQuantileDMatrix
|
||||
with pytest.raises(xgb.core.XGBoostError):
|
||||
dtrain.slice(rindex=[0, 1, 2])
|
||||
dtrain.slice(rindex=[0, 1, 2])
|
||||
else:
|
||||
dtrain.slice(rindex=[0, 1, 2])
|
||||
dtrain.slice(rindex=[0, 1, 2])
|
||||
|
||||
return dtrain
|
||||
|
||||
|
||||
@@ -41,7 +51,7 @@ def _test_from_cupy(DMatrixT):
|
||||
|
||||
with pytest.raises(Exception):
|
||||
X = cp.random.randn(2, 2, dtype="float32")
|
||||
dtrain = DMatrixT(X, label=X)
|
||||
DMatrixT(X, label=X)
|
||||
|
||||
|
||||
def _test_cupy_training(DMatrixT):
|
||||
@@ -88,11 +98,14 @@ def _test_cupy_metainfo(DMatrixT):
|
||||
dmat_cupy.set_interface_info('group', cupy_uints)
|
||||
|
||||
# Test setting info with cupy
|
||||
assert np.array_equal(dmat.get_float_info('weight'), dmat_cupy.get_float_info('weight'))
|
||||
assert np.array_equal(dmat.get_float_info('label'), dmat_cupy.get_float_info('label'))
|
||||
assert np.array_equal(dmat.get_float_info('weight'),
|
||||
dmat_cupy.get_float_info('weight'))
|
||||
assert np.array_equal(dmat.get_float_info('label'),
|
||||
dmat_cupy.get_float_info('label'))
|
||||
assert np.array_equal(dmat.get_float_info('base_margin'),
|
||||
dmat_cupy.get_float_info('base_margin'))
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr'))
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'),
|
||||
dmat_cupy.get_uint_info('group_ptr'))
|
||||
|
||||
|
||||
class TestFromCupy:
|
||||
@@ -135,7 +148,9 @@ Arrow specification.'''
|
||||
import cupy as cp
|
||||
n = 100
|
||||
X = cp.random.random((n, 2))
|
||||
xgb.DeviceQuantileDMatrix(X.toDlpack())
|
||||
m = xgb.DeviceQuantileDMatrix(X.toDlpack())
|
||||
with pytest.raises(xgb.core.XGBoostError):
|
||||
m.slice(rindex=[0, 1, 2])
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
@pytest.mark.mgpu
|
||||
|
||||
@@ -67,7 +67,8 @@ class TestPandas(unittest.TestCase):
|
||||
# 0 1 1 0 0
|
||||
# 1 2 0 1 0
|
||||
# 2 3 0 0 1
|
||||
result, _, _ = xgb.core._maybe_pandas_data(dummies, None, None)
|
||||
pandas_handler = xgb.data.PandasHandler(np.nan, 0, False)
|
||||
result, _, _ = pandas_handler._maybe_pandas_data(dummies, None, None)
|
||||
exp = np.array([[1., 1., 0., 0.],
|
||||
[2., 0., 1., 0.],
|
||||
[3., 0., 0., 1.]])
|
||||
@@ -113,12 +114,12 @@ class TestPandas(unittest.TestCase):
|
||||
import pandas as pd
|
||||
rows = 100
|
||||
X = pd.DataFrame(
|
||||
{"A": pd.SparseArray(np.random.randint(0, 10, size=rows)),
|
||||
"B": pd.SparseArray(np.random.randn(rows)),
|
||||
"C": pd.SparseArray(np.random.permutation(
|
||||
{"A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)),
|
||||
"B": pd.arrays.SparseArray(np.random.randn(rows)),
|
||||
"C": pd.arrays.SparseArray(np.random.permutation(
|
||||
[True, False] * (rows // 2)))}
|
||||
)
|
||||
y = pd.Series(pd.SparseArray(np.random.randn(rows)))
|
||||
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
|
||||
dtrain = xgb.DMatrix(X, y)
|
||||
booster = xgb.train({}, dtrain, num_boost_round=4)
|
||||
predt_sparse = booster.predict(xgb.DMatrix(X))
|
||||
@@ -128,17 +129,18 @@ class TestPandas(unittest.TestCase):
|
||||
def test_pandas_label(self):
|
||||
# label must be a single column
|
||||
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
|
||||
self.assertRaises(ValueError, xgb.core._maybe_pandas_data, df,
|
||||
pandas_handler = xgb.data.PandasHandler(np.nan, 0, False)
|
||||
self.assertRaises(ValueError, pandas_handler._maybe_pandas_data, df,
|
||||
None, None, 'label', 'float')
|
||||
|
||||
# label must be supported dtype
|
||||
df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
|
||||
self.assertRaises(ValueError, xgb.core._maybe_pandas_data, df,
|
||||
self.assertRaises(ValueError, pandas_handler._maybe_pandas_data, df,
|
||||
None, None, 'label', 'float')
|
||||
|
||||
df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
|
||||
result, _, _ = xgb.core._maybe_pandas_data(df, None, None,
|
||||
'label', 'float')
|
||||
result, _, _ = pandas_handler._maybe_pandas_data(df, None, None,
|
||||
'label', 'float')
|
||||
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
|
||||
dtype=float))
|
||||
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
|
||||
|
||||
Reference in New Issue
Block a user