Categorical data support for cuDF. (#7042)

* Add support in DMatrix.
* Add support in DQM (DeviceQuantileDMatrix), except for iterator.
2021-06-17 13:54:33 +08:00
parent 5c2d7a18c9
commit d9799b09d0
5 changed files with 129 additions and 106 deletions
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -171,6 +171,21 @@ Arrow specification.'''
    def test_cudf_metainfo_device_dmatrix(self):
        _test_cudf_metainfo(xgb.DeviceQuantileDMatrix)

+    @pytest.mark.skipif(**tm.no_cudf())
+    def test_categorical(self):
+        import cudf
+        _X, _y = tm.make_categorical(100, 30, 17, False)
+        X = cudf.from_pandas(_X)
+        y = cudf.from_pandas(_y)
+
+        Xy = xgb.DMatrix(X, y, enable_categorical=True)
+        assert len(Xy.feature_types) == X.shape[1]
+        assert all(t == "categorical" for t in Xy.feature_types)
+
+        Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
+        assert len(Xy.feature_types) == X.shape[1]
+        assert all(t == "categorical" for t in Xy.feature_types)
+

@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_cupy())
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -43,22 +43,8 @@ class TestGPUUpdaters:
        assert tm.non_increasing(result['train'][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
-        import pandas as pd
-        rng = np.random.RandomState(1994)
-
-        pd_dict = {}
-        for i in range(cols):
-            c = rng.randint(low=0, high=cats+1, size=rows)
-            pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
-
-        df = pd.DataFrame(pd_dict)
-        label = df.iloc[:, 0]
-        for i in range(0, cols-1):
-            label += df.iloc[:, i]
-        label += 1
-        df = df.astype('category')
-        onehot = pd.get_dummies(df)
-        cat = df
+        onehot, label = tm.make_categorical(rows, cols, cats, True)
+        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -234,6 +234,34 @@ def get_mq2008(dpath):
            x_valid, y_valid, qid_valid)


+@memory.cache
+def make_categorical(
+    n_samples: int, n_features: int, n_categories: int, onehot_enc: bool
+):
+    import pandas as pd
+
+    rng = np.random.RandomState(1994)
+
+    pd_dict = {}
+    for i in range(n_features + 1):
+        c = rng.randint(low=0, high=n_categories + 1, size=n_samples)
+        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
+
+    df = pd.DataFrame(pd_dict)
+    label = df.iloc[:, 0]
+    df = df.iloc[:, 1:]
+    for i in range(0, n_features):
+        label += df.iloc[:, i]
+    label += 1
+
+    df = df.astype("category")
+    if onehot_enc:
+        cat = pd.get_dummies(df)
+    else:
+        cat = df
+    return cat, label
+
+
 _unweighted_datasets_strategy = strategies.sampled_from(
    [TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
     TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),