[backport] Handle missing values in dataframe with category dtype. (#7331) (#7413)

* Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.

* Fix cherry-pick error.
This commit is contained in:
Jiaming Yuan
2021-11-10 21:24:46 +08:00
committed by GitHub
parent 11f8b5cfcd
commit 14c56f05da
13 changed files with 308 additions and 110 deletions

View File

@@ -138,9 +138,22 @@ class TestPandas:
X, enable_categorical=True
)
assert np.issubdtype(transformed[:, 0].dtype, np.integer)
assert transformed[:, 0].min() == 0
# test missing value
X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
assert not np.any(arr == -1.0)
X = X["f0"]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
def test_pandas_sparse(self):
import pandas as pd
rows = 100