Support categorical data for dask functional interface and DQM. (#7043)

* Support categorical data for dask functional interface and DQM.
* Implement categorical data support for GPU GK-merge.
* Add support for dask functional interface.
* Add support for DQM.
* Get newer cupy.
2021-06-18 13:06:52 +08:00
parent 7dd29ffd47
commit 86715e4cd4
16 changed files with 364 additions and 167 deletions
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -225,19 +225,28 @@ class IterForDMatrixTest(xgb.core.DataIter):
    ROWS_PER_BATCH = 100            # data is split by rows
    BATCHES = 16                    # number of batches held by the iterator

-    def __init__(self):
+    def __init__(self, categorical):
        '''Generate some random data for demonstration.

        Actual data can be anything that is currently supported by XGBoost.
        '''
        import cudf
        self.rows = self.ROWS_PER_BATCH
-        rng = np.random.RandomState(1994)
-        self._data = [
-            cudf.DataFrame(
-                {'a': rng.randn(self.ROWS_PER_BATCH),
-                 'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
-        self._labels = [rng.randn(self.rows)] * self.BATCHES
+
+        if categorical:
+            self._data = []
+            self._labels = []
+            for i in range(self.BATCHES):
+                X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False)
+                self._data.append(cudf.from_pandas(X))
+                self._labels.append(y)
+        else:
+            rng = np.random.RandomState(1994)
+            self._data = [
+                cudf.DataFrame(
+                    {'a': rng.randn(self.ROWS_PER_BATCH),
+                     'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
+            self._labels = [rng.randn(self.rows)] * self.BATCHES

        self.it = 0             # set iterator to 0
        super().__init__()
@@ -272,24 +281,26 @@ class IterForDMatrixTest(xgb.core.DataIter):


@pytest.mark.skipif(**tm.no_cudf())
-def test_from_cudf_iter():
+@pytest.mark.parametrize("enable_categorical", [True, False])
+def test_from_cudf_iter(enable_categorical):
+    """Compare a model trained from an iterator with one trained from arrays.
+
+    One booster is trained on a ``DeviceQuantileDMatrix`` built from the
+    iterator, the other on a ``DMatrix`` built from ``it.as_array()`` and
+    ``it.as_array_labels()``.  Both matrices must report the same shape and
+    both boosters must produce matching predictions.  The test runs once with
+    ``enable_categorical=True`` and once with ``False``.
+    """
    rounds = 100
-    it = IterForDMatrixTest()
+    it = IterForDMatrixTest(enable_categorical)
+    params = {"tree_method": "gpu_hist"}

    # Use iterator
-    m_it = xgb.DeviceQuantileDMatrix(it)
-    reg_with_it = xgb.train({'tree_method': 'gpu_hist'}, m_it,
-                            num_boost_round=rounds)
-    predict_with_it = reg_with_it.predict(m_it)
+    m_it = xgb.DeviceQuantileDMatrix(it, enable_categorical=enable_categorical)
+    reg_with_it = xgb.train(params, m_it, num_boost_round=rounds)

-    # Without using iterator
-    m = xgb.DMatrix(it.as_array(), it.as_array_labels())
+    # Without using iterator
+    X = it.as_array()
+    y = it.as_array_labels()
+
+    m = xgb.DMatrix(X, y, enable_categorical=enable_categorical)

    assert m_it.num_col() == m.num_col()
    assert m_it.num_row() == m.num_row()

-    reg = xgb.train({'tree_method': 'gpu_hist'}, m,
-                    num_boost_round=rounds)
-    predict = reg.predict(m)
+    reg = xgb.train(params, m, num_boost_round=rounds)

+    # Predict with each booster on its own training matrix, then compare.
+    predict = reg.predict(m)
+    predict_with_it = reg_with_it.predict(m_it)
    np.testing.assert_allclose(predict_with_it, predict)
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -1,11 +1,13 @@
 import sys
 import os
-from typing import Type, TypeVar, Any, Dict, List
+from typing import Type, TypeVar, Any, Dict, List, Tuple
 import pytest
 import numpy as np
 import asyncio
 import xgboost
 import subprocess
+import tempfile
+import json
 from collections import OrderedDict
 from inspect import signature
 from hypothesis import given, strategies, settings, note
@@ -41,6 +43,49 @@ except ImportError:
    pass


+def make_categorical(
+    client: Client,
+    n_samples: int,
+    n_features: int,
+    n_categories: int,
+    onehot: bool = False,
+) -> Tuple[dd.DataFrame, dd.Series]:
+    """Build a dask dataframe of categorical features, one partition per worker.
+
+    Every worker generates its own partition with ``tm.make_categorical``;
+    the partitions are then assembled with ``dd.from_delayed``.
+
+    Parameters
+    ----------
+    client :
+        Dask client used to submit the generation tasks.
+    n_samples :
+        Total number of rows across all partitions.
+    n_features :
+        Forwarded to ``tm.make_categorical``.
+    n_categories :
+        Forwarded to ``tm.make_categorical``.
+    onehot :
+        When true, the features are passed through ``dd.get_dummies`` before
+        being returned.
+
+    Returns
+    -------
+    The feature dataframe and the ``"label"`` column as a series.
+    """
+    workers = _get_client_workers(client)
+    n_workers = len(workers)
+    # Even share per worker.  The last worker additionally takes the
+    # remainder so the partitions add up to exactly ``n_samples`` rows.
+    n_per_worker = n_samples // n_workers
+
+    def pack(**kwargs: Any) -> Any:
+        # Keep features and label in one frame so that they travel together
+        # as a single partition.
+        X, y = tm.make_categorical(**kwargs)
+        X["label"] = y
+        return X
+
+    # A one-row sample is handed to ``dd.from_delayed`` as ``meta``.
+    meta = pack(
+        n_samples=1, n_features=n_features, n_categories=n_categories, onehot=False
+    )
+
+    dfs = []
+    for i, worker in enumerate(workers):
+        if i == n_workers - 1:
+            l_n_samples = n_samples - i * n_per_worker
+        else:
+            l_n_samples = n_per_worker
+        future = client.submit(
+            pack,
+            n_samples=l_n_samples,
+            n_features=n_features,
+            n_categories=n_categories,
+            onehot=False,
+            workers=[worker],
+        )
+        dfs.append(future)
+
+    df = dd.from_delayed(dfs, meta=meta)
+    y = df["label"]
+    X = df[df.columns.difference(["label"])]
+
+    # One-hot encoding is applied on the assembled dask frame, not per worker.
+    if onehot:
+        return dd.get_dummies(X), y
+    return X, y
+
+
 def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
    import cupy as cp
    cp.cuda.runtime.setDevice(0)
@@ -126,6 +171,62 @@ def run_with_dask_array(DMatrixT: Type, client: Client) -> None:
        inplace_predictions)


+@pytest.mark.skipif(**tm.no_dask_cudf())
+def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
+    """Compare built-in categorical training with one-hot encoded training.
+
+    The RMSE histories of both runs must agree within ``rtol=1e-3`` and the
+    built-in one must be non-increasing.  The saved JSON model is then read
+    back to check that the last tree has a non-empty ``categories_sizes``
+    whose entries are all 1.
+    """
+    with Client(local_cuda_cluster) as client:
+        import dask_cudf
+
+        n_rounds = 10
+        n_rows, n_cols, n_cats = 10000, 30, 13
+
+        X, y = make_categorical(client, n_rows, n_cols, n_cats)
+        X = dask_cudf.from_dask_dataframe(X)
+
+        X_onehot, _ = make_categorical(client, n_rows, n_cols, n_cats, True)
+        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
+
+        params = {"tree_method": "gpu_hist"}
+
+        def fit(features: Any) -> Dict[str, Any]:
+            # Train on ``features`` and evaluate on the same matrix as "Train".
+            dtrain = dxgb.DaskDMatrix(client, features, y, enable_categorical=True)
+            return dxgb.train(
+                client,
+                params,
+                dtrain,
+                num_boost_round=n_rounds,
+                evals=[(dtrain, "Train")],
+            )
+
+        onehot_rmse = fit(X_onehot)["history"]["Train"]["rmse"]
+        builtin_output = fit(X)
+        builtin_rmse = builtin_output["history"]["Train"]["rmse"]
+
+        np.testing.assert_allclose(
+            np.array(onehot_rmse), np.array(builtin_rmse), rtol=1e-3
+        )
+        assert tm.non_increasing(builtin_rmse)
+
+        booster = builtin_output["booster"]
+        with tempfile.TemporaryDirectory() as tempdir:
+            model_path = os.path.join(tempdir, "model.json")
+            booster.save_model(model_path)
+            with open(model_path, "r") as fd:
+                model_json = json.load(fd)
+
+        last_tree = model_json["learner"]["gradient_booster"]["model"]["trees"][-1]
+        categories_sizes = np.array(last_tree["categories_sizes"])
+        assert categories_sizes.shape[0] != 0
+        np.testing.assert_allclose(categories_sizes, 1)
+
+
 def to_cp(x: Any, DMatrixT: Type) -> Any:
    import cupy
    if isinstance(x, np.ndarray) and \