Enable categorical data support on Python DMatrix. (#6166)

* Only pandas is recognized as a source of categorical data.
2020-09-29 11:22:56 +08:00
parent 52c0b3f100
commit 7622b8cdb8
4 changed files with 57 additions and 23 deletions
--- a/tests/python/test_with_modin.py
+++ b/tests/python/test_with_modin.py
@@ -67,7 +67,8 @@ class TestModin(unittest.TestCase):
        # 0  1    1    0    0
        # 1  2    0    1    0
        # 2  3    0    0    1
-        result, _, _ = xgb.data._transform_pandas_df(dummies)
+        result, _, _ = xgb.data._transform_pandas_df(dummies,
+                                                     enable_categorical=False)
        exp = np.array([[1., 1., 0., 0.],
                        [2., 0., 1., 0.],
                        [3., 0., 0., 1.]])
@@ -113,15 +114,15 @@ class TestModin(unittest.TestCase):
        # label must be a single column
        df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
        self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')

        # label must be supported dtype
        df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
        self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')

        df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
-        result, _, _ = xgb.data._transform_pandas_df(df, None, None,
+        result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
                                                     'label', 'float')
        np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
                                                       dtype=float))
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -67,7 +67,8 @@ class TestPandas(unittest.TestCase):
        # 0  1    1    0    0
        # 1  2    0    1    0
        # 2  3    0    0    1
-        result, _, _ = xgb.data._transform_pandas_df(dummies)
+        result, _, _ = xgb.data._transform_pandas_df(dummies,
+                                                     enable_categorical=False)
        exp = np.array([[1., 1., 0., 0.],
                        [2., 0., 1., 0.],
                        [3., 0., 0., 1.]])
@@ -109,6 +110,16 @@ class TestPandas(unittest.TestCase):
        assert dm.num_row() == 2
        assert dm.num_col() == 6

+    def test_pandas_categorical(self):
+        rng = np.random.RandomState(1994)
+        rows = 100
+        X = rng.randint(3, 7, size=rows)
+        X = pd.Series(X, dtype="category")
+        X = pd.DataFrame({'f0': X})
+        y = rng.randn(rows)
+        m = xgb.DMatrix(X, y, enable_categorical=True)
+        assert m.feature_types[0] == 'categorical'
+
    def test_pandas_sparse(self):
        import pandas as pd
        rows = 100
@@ -129,15 +140,15 @@ class TestPandas(unittest.TestCase):
        # label must be a single column
        df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
        self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')

        # label must be supported dtype
        df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
        self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')

        df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
-        result, _, _ = xgb.data._transform_pandas_df(df, None, None,
+        result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
                                                     'label', 'float')
        np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
                                                       dtype=float))