[backport] Handle missing values in dataframe with category dtype. (#7331) (#7413)

* Handle missing values in dataframe with category dtype. (#7331)
* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
* Fix pick error.
2021-11-10 21:24:46 +08:00
parent 11f8b5cfcd
commit 14c56f05da
13 changed files with 308 additions and 110 deletions
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -138,9 +138,22 @@ class TestPandas:
            X, enable_categorical=True
        )

-        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
        assert transformed[:, 0].min() == 0

+        # test missing value
+        X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
+        X["f0"] = X["f0"].astype("category")
+        arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
+        assert not np.any(arr == -1.0)
+
+        X = X["f0"]
+        with pytest.raises(ValueError):
+            xgb.DMatrix(X, y)
+
+        Xy = xgb.DMatrix(X, y, enable_categorical=True)
+        assert Xy.num_row() == 3
+        assert Xy.num_col() == 1
+
    def test_pandas_sparse(self):
        import pandas as pd
        rows = 100