Support dataframe data format in native XGBoost. (#9828)
- Implement a columnar adapter.
- Refactor Python pandas handling code to avoid converting into a single NumPy array.
- Add support in R for transforming columns.
- Support R data.frame and factor type.
This commit is contained in:
@@ -16,7 +16,7 @@ pytestmark = pytest.mark.skipif(**tm.no_modin())
|
||||
|
||||
class TestModin:
|
||||
@pytest.mark.xfail
|
||||
def test_modin(self):
|
||||
def test_modin(self) -> None:
|
||||
df = md.DataFrame([[1, 2., True], [2, 3., False]],
|
||||
columns=['a', 'b', 'c'])
|
||||
dm = xgb.DMatrix(df, label=md.Series([1, 2]))
|
||||
@@ -67,8 +67,8 @@ class TestModin:
|
||||
enable_categorical=False)
|
||||
exp = np.array([[1., 1., 0., 0.],
|
||||
[2., 0., 1., 0.],
|
||||
[3., 0., 0., 1.]])
|
||||
np.testing.assert_array_equal(result, exp)
|
||||
[3., 0., 0., 1.]]).T
|
||||
np.testing.assert_array_equal(result.columns, exp)
|
||||
dm = xgb.DMatrix(dummies)
|
||||
assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
|
||||
assert dm.feature_types == ['int', 'int', 'int', 'int']
|
||||
@@ -108,20 +108,23 @@ class TestModin:
|
||||
|
||||
def test_modin_label(self):
|
||||
# label must be a single column
|
||||
df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
|
||||
df = md.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
|
||||
with pytest.raises(ValueError):
|
||||
xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float')
|
||||
xgb.data._transform_pandas_df(df, False, None, None, "label")
|
||||
|
||||
# label must be supported dtype
|
||||
df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
|
||||
df = md.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
|
||||
with pytest.raises(ValueError):
|
||||
xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float')
|
||||
xgb.data._transform_pandas_df(df, False, None, None, "label")
|
||||
|
||||
df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
|
||||
result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
|
||||
'label', 'float')
|
||||
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
|
||||
dtype=float))
|
||||
df = md.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
|
||||
result, _, _ = xgb.data._transform_pandas_df(
|
||||
df, False, None, None, "label"
|
||||
)
|
||||
np.testing.assert_array_equal(
|
||||
np.stack(result.columns, axis=1),
|
||||
np.array([[1.0], [2.0], [3.0]], dtype=float),
|
||||
)
|
||||
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
|
||||
assert dm.num_row() == 3
|
||||
assert dm.num_col() == 2
|
||||
|
||||
Reference in New Issue
Block a user