Support dataframe data format in native XGBoost. (#9828)

- Implement a columnar adapter.
- Refactor the Python pandas handling code to avoid converting the dataframe into a single NumPy array.
- Add support in R for transforming columns.
- Support the R data.frame and factor types.
This commit is contained in:
Jiaming Yuan
2023-12-12 09:56:31 +08:00
committed by GitHub
parent b3700bbb3f
commit faf0f2df10
21 changed files with 718 additions and 221 deletions

View File

@@ -16,7 +16,7 @@ pytestmark = pytest.mark.skipif(**tm.no_modin())
class TestModin:
@pytest.mark.xfail
def test_modin(self):
def test_modin(self) -> None:
df = md.DataFrame([[1, 2., True], [2, 3., False]],
columns=['a', 'b', 'c'])
dm = xgb.DMatrix(df, label=md.Series([1, 2]))
@@ -67,8 +67,8 @@ class TestModin:
enable_categorical=False)
exp = np.array([[1., 1., 0., 0.],
[2., 0., 1., 0.],
[3., 0., 0., 1.]])
np.testing.assert_array_equal(result, exp)
[3., 0., 0., 1.]]).T
np.testing.assert_array_equal(result.columns, exp)
dm = xgb.DMatrix(dummies)
assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
assert dm.feature_types == ['int', 'int', 'int', 'int']
@@ -108,20 +108,23 @@ class TestModin:
def test_modin_label(self):
# label must be a single column
df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
df = md.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
with pytest.raises(ValueError):
xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float')
xgb.data._transform_pandas_df(df, False, None, None, "label")
# label must be supported dtype
df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
df = md.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
with pytest.raises(ValueError):
xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float')
xgb.data._transform_pandas_df(df, False, None, None, "label")
df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
'label', 'float')
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
dtype=float))
df = md.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
result, _, _ = xgb.data._transform_pandas_df(
df, False, None, None, "label"
)
np.testing.assert_array_equal(
np.stack(result.columns, axis=1),
np.array([[1.0], [2.0], [3.0]], dtype=float),
)
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
assert dm.num_row() == 3
assert dm.num_col() == 2