Fix mixed types with cuDF. (#8280)

This commit is contained in:
Jiaming Yuan
2022-09-29 00:57:52 +08:00
committed by GitHub
parent f835368bcf
commit 6925b222e0
3 changed files with 106 additions and 47 deletions

View File

@@ -1,6 +1,8 @@
import json
import sys
import numpy as np
import xgboost as xgb
import sys
import pytest
sys.path.append("tests/python")
@@ -176,20 +178,38 @@ Arrow specification.'''
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_categorical(self):
def test_cudf_categorical(self) -> None:
import cudf
_X, _y = tm.make_categorical(100, 30, 17, False)
n_features = 30
_X, _y = tm.make_categorical(100, n_features, 17, False)
X = cudf.from_pandas(_X)
y = cudf.from_pandas(_y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.feature_types is not None
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "c" for t in Xy.feature_types)
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
assert Xy.feature_types is not None
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "c" for t in Xy.feature_types)
# mixed dtypes
X["1"] = X["1"].astype(np.int64)
X["3"] = X["3"].astype(np.int64)
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
X, None, None, enable_categorical=True
)
assert X.shape[1] == n_features
assert len(cat_codes) == X.shape[1]
assert not cat_codes[0]
assert not cat_codes[2]
interfaces_str = xgb.data._cudf_array_interfaces(df, cat_codes)
interfaces = json.loads(interfaces_str)
assert len(interfaces) == X.shape[1]
# test missing value
X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
@@ -206,7 +226,7 @@ Arrow specification.'''
assert Xy.num_row() == 3
assert Xy.num_col() == 1
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="enable_categorical"):
xgb.DeviceQuantileDMatrix(X, y)
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)