Support dataframe data format in native XGBoost. (#9828)

- Implement a columnar adapter.
- Refactor Python pandas handling code to avoid converting into a single numpy array.
- Add support in R for transforming columns.
- Support R data.frame and factor type.
2023-12-12 09:56:31 +08:00
parent b3700bbb3f
commit faf0f2df10
21 changed files with 718 additions and 221 deletions
--- a/tests/python/test_with_modin.py
+++ b/tests/python/test_with_modin.py
@@ -16,7 +16,7 @@ pytestmark = pytest.mark.skipif(**tm.no_modin())

 class TestModin:
    @pytest.mark.xfail
-    def test_modin(self):
+    def test_modin(self) -> None:
        df = md.DataFrame([[1, 2., True], [2, 3., False]],
                          columns=['a', 'b', 'c'])
        dm = xgb.DMatrix(df, label=md.Series([1, 2]))
@@ -67,8 +67,8 @@ class TestModin:
                                                     enable_categorical=False)
        exp = np.array([[1., 1., 0., 0.],
                        [2., 0., 1., 0.],
-                        [3., 0., 0., 1.]])
-        np.testing.assert_array_equal(result, exp)
+                        [3., 0., 0., 1.]]).T
+        np.testing.assert_array_equal(result.columns, exp)
        dm = xgb.DMatrix(dummies)
        assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
        assert dm.feature_types == ['int', 'int', 'int', 'int']
@@ -108,20 +108,23 @@ class TestModin:

    def test_modin_label(self):
        # label must be a single column
-        df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
+        df = md.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
        with pytest.raises(ValueError):
-            xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float')
+            xgb.data._transform_pandas_df(df, False, None, None, "label")

        # label must be supported dtype
-        df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
+        df = md.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
        with pytest.raises(ValueError):
-            xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float')
+            xgb.data._transform_pandas_df(df, False, None, None, "label")

-        df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
-        result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
-                                                     'label', 'float')
-        np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
-                                                       dtype=float))
+        df = md.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
+        result, _, _ = xgb.data._transform_pandas_df(
+            df, False, None, None, "label"
+        )
+        np.testing.assert_array_equal(
+            np.stack(result.columns, axis=1),
+            np.array([[1.0], [2.0], [3.0]], dtype=float),
+        )
        dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
        assert dm.num_row() == 3
        assert dm.num_col() == 2
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -105,8 +105,8 @@ class TestPandas:
        result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False)
        exp = np.array(
            [[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]]
-        )
-        np.testing.assert_array_equal(result, exp)
+        ).T
+        np.testing.assert_array_equal(result.columns, exp)
        dm = xgb.DMatrix(dummies, data_split_mode=data_split_mode)
        assert dm.num_row() == 3
        if data_split_mode == DataSplitMode.ROW:
@@ -202,6 +202,20 @@ class TestPandas:
        else:
            assert dm.num_col() == 1 * world_size

+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_multi_target(self) -> None:
+        from sklearn.datasets import make_regression
+
+        X, y = make_regression(n_samples=1024, n_features=4, n_targets=3)
+        ydf = pd.DataFrame({i: y[:, i] for i in range(y.shape[1])})
+
+        Xy = xgb.DMatrix(X, ydf)
+        assert Xy.num_row() == y.shape[0]
+        assert Xy.get_label().size == y.shape[0] * y.shape[1]
+        Xy = xgb.QuantileDMatrix(X, ydf)
+        assert Xy.num_row() == y.shape[0]
+        assert Xy.get_label().size == y.shape[0] * y.shape[1]
+
    def test_slice(self):
        rng = np.random.RandomState(1994)
        rows = 100
@@ -233,13 +247,14 @@ class TestPandas:
            X, enable_categorical=True
        )

-        assert transformed[:, 0].min() == 0
+        assert transformed.columns[0].min() == 0

        # test missing value
        X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
        X["f0"] = X["f0"].astype("category")
        arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
-        assert not np.any(arr == -1.0)
+        for c in arr.columns:
+            assert not np.any(c == -1.0)

        X = X["f0"]
        y = y[: X.shape[0]]
@@ -273,24 +288,25 @@ class TestPandas:
        predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
        np.testing.assert_allclose(predt_sparse, predt_dense)

-    def test_pandas_label(self, data_split_mode=DataSplitMode.ROW):
+    def test_pandas_label(
+        self, data_split_mode: DataSplitMode = DataSplitMode.ROW
+    ) -> None:
        world_size = xgb.collective.get_world_size()
        # label must be a single column
        df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
        with pytest.raises(ValueError):
-            xgb.data._transform_pandas_df(df, False, None, None, "label", "float")
+            xgb.data._transform_pandas_df(df, False, None, None, "label")

        # label must be supported dtype
        df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
        with pytest.raises(ValueError):
-            xgb.data._transform_pandas_df(df, False, None, None, "label", "float")
+            xgb.data._transform_pandas_df(df, False, None, None, "label")

        df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
-        result, _, _ = xgb.data._transform_pandas_df(
-            df, False, None, None, "label", "float"
-        )
+        result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, "label")
        np.testing.assert_array_equal(
-            result, np.array([[1.0], [2.0], [3.0]], dtype=float)
+            np.stack(result.columns, axis=1),
+            np.array([[1.0], [2.0], [3.0]], dtype=float),
        )
        dm = xgb.DMatrix(
            np.random.randn(3, 2), label=df, data_split_mode=data_split_mode
@@ -507,6 +523,35 @@ class TestPandas:
                np.testing.assert_allclose(m_orig.get_label(), m_etype.get_label())
                np.testing.assert_allclose(m_etype.get_label(), y.values)

+    @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
+    def test_mixed_type(self, DMatrixT: Type[xgb.DMatrix]) -> None:
+        f0 = np.arange(0, 4)
+        f1 = pd.Series(f0, dtype="int64[pyarrow]")
+        f2l = list(f0)
+        f2l[0] = pd.NA
+        f2 = pd.Series(f2l, dtype=pd.Int64Dtype())
+
+        df = pd.DataFrame({"f0": f0})
+        df["f2"] = f2
+
+        m = DMatrixT(df)
+        assert m.num_col() == df.shape[1]
+
+        df["f1"] = f1
+        m = DMatrixT(df)
+        assert m.num_col() == df.shape[1]
+        assert m.num_row() == df.shape[0]
+        assert m.num_nonmissing() == df.size - 1
+        assert m.feature_names == list(map(str, df.columns))
+        assert m.feature_types == ["int"] * df.shape[1]
+
+        y = f0
+        m.set_info(label=y)
+        booster = xgb.train({}, m)
+        p0 = booster.inplace_predict(df)
+        p1 = booster.predict(m)
+        np.testing.assert_allclose(p0, p1)
+
    @pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows")
    def test_pandas_column_split(self):
        tm.run_with_rabit(