Support categorical data for dask functional interface and DQM. (#7043)

* Support categorical data for dask functional interface and DQM.
* Implement categorical data support for GPU GK-merge.
* Add support for dask functional interface.
* Add support for DQM (`DeviceQuantileDMatrix`).
* Get newer cupy (pin `cupy=9.1*` in the GPU CI Dockerfile).
2021-06-18 13:06:52 +08:00
parent 7dd29ffd47
commit 86715e4cd4
16 changed files with 364 additions and 167 deletions
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -19,7 +19,7 @@ ENV PATH=/opt/python/bin:$PATH
 # Create new Conda environment with cuDF, Dask, and cuPy
 RUN \
    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.7 cudf=21.08* rmm=21.08* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda dask-cudf cupy \
+        python=3.7 cudf=21.08* rmm=21.08* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda dask-cudf cupy=9.1* \
        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis

 ENV GOSU_VERSION 1.10
--- a/tests/cpp/data/test_iterative_device_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_device_dmatrix.cu
@@ -68,7 +68,16 @@ void TestEquivalent(float sparsity) {
    auto const& buffer_from_iter = page_concatenated->gidx_buffer;
    auto const& buffer_from_data = ellpack.Impl()->gidx_buffer;
    ASSERT_NE(buffer_from_data.Size(), 0);
-    ASSERT_EQ(buffer_from_data.ConstHostVector(), buffer_from_data.ConstHostVector());
+
+    common::CompressedIterator<uint32_t> data_buf{
+        buffer_from_data.ConstHostPointer(), from_data.NumSymbols()};
+    common::CompressedIterator<uint32_t> data_iter{
+        buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()};
+    CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols());
+    CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride);
+    for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) {
+      CHECK_EQ(data_buf[i], data_iter[i]);
+    }
  }
 }

--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -225,6 +225,9 @@ void TestCategoricalPrediction(std::string name) {
  row[split_ind] = split_cat;
  auto m = GetDMatrixFromData(row, 1, kCols);

+  std::vector<FeatureType> types(10, FeatureType::kCategorical);
+  m->Info().feature_types.HostVector() = types;
+
  predictor->InitOutPredictions(m->Info(), &out_predictions.predictions, model);
  predictor->PredictBatch(m.get(), &out_predictions, model, 0);
  ASSERT_EQ(out_predictions.predictions.Size(), 1ul);
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -225,19 +225,28 @@ class IterForDMatrixTest(xgb.core.DataIter):
    ROWS_PER_BATCH = 100            # data is splited by rows
    BATCHES = 16

-    def __init__(self):
+    def __init__(self, categorical):
        '''Generate some random data for demostration.

        Actual data can be anything that is currently supported by XGBoost.
        '''
        import cudf
        self.rows = self.ROWS_PER_BATCH
-        rng = np.random.RandomState(1994)
-        self._data = [
-            cudf.DataFrame(
-                {'a': rng.randn(self.ROWS_PER_BATCH),
-                 'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
-        self._labels = [rng.randn(self.rows)] * self.BATCHES
+
+        if categorical:
+            self._data = []
+            self._labels = []
+            for i in range(self.BATCHES):
+                X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False)
+                self._data.append(cudf.from_pandas(X))
+                self._labels.append(y)
+        else:
+            rng = np.random.RandomState(1994)
+            self._data = [
+                cudf.DataFrame(
+                    {'a': rng.randn(self.ROWS_PER_BATCH),
+                     'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
+            self._labels = [rng.randn(self.rows)] * self.BATCHES

        self.it = 0             # set iterator to 0
        super().__init__()
@@ -272,24 +281,26 @@ class IterForDMatrixTest(xgb.core.DataIter):


@pytest.mark.skipif(**tm.no_cudf())
-def test_from_cudf_iter():
+@pytest.mark.parametrize("enable_categorical", [True, False])
+def test_from_cudf_iter(enable_categorical):
    rounds = 100
-    it = IterForDMatrixTest()
+    it = IterForDMatrixTest(enable_categorical)
+    params = {"tree_method": "gpu_hist"}

    # Use iterator
-    m_it = xgb.DeviceQuantileDMatrix(it)
-    reg_with_it = xgb.train({'tree_method': 'gpu_hist'}, m_it,
-                            num_boost_round=rounds)
-    predict_with_it = reg_with_it.predict(m_it)
+    m_it = xgb.DeviceQuantileDMatrix(it, enable_categorical=enable_categorical)
+    reg_with_it = xgb.train(params, m_it, num_boost_round=rounds)

-    # Without using iterator
-    m = xgb.DMatrix(it.as_array(), it.as_array_labels())
+    X = it.as_array()
+    y = it.as_array_labels()
+
+    m = xgb.DMatrix(X, y, enable_categorical=enable_categorical)

    assert m_it.num_col() == m.num_col()
    assert m_it.num_row() == m.num_row()

-    reg = xgb.train({'tree_method': 'gpu_hist'}, m,
-                    num_boost_round=rounds)
-    predict = reg.predict(m)
+    reg = xgb.train(params, m, num_boost_round=rounds)

+    predict = reg.predict(m)
+    predict_with_it = reg_with_it.predict(m_it)
    np.testing.assert_allclose(predict_with_it, predict)
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -1,11 +1,13 @@
 import sys
 import os
-from typing import Type, TypeVar, Any, Dict, List
+from typing import Type, TypeVar, Any, Dict, List, Tuple
 import pytest
 import numpy as np
 import asyncio
 import xgboost
 import subprocess
+import tempfile
+import json
 from collections import OrderedDict
 from inspect import signature
 from hypothesis import given, strategies, settings, note
@@ -41,6 +43,49 @@ except ImportError:
    pass


+def make_categorical(
+    client: Client,
+    n_samples: int,
+    n_features: int,
+    n_categories: int,
+    onehot: bool = False,
+) -> Tuple[dd.DataFrame, dd.Series]:
+    workers = _get_client_workers(client)
+    n_workers = len(workers)
+    dfs = []
+
+    def pack(**kwargs: Any) -> dd.DataFrame:
+        X, y = tm.make_categorical(**kwargs)
+        X["label"] = y
+        return X
+
+    meta = pack(
+        n_samples=1, n_features=n_features, n_categories=n_categories, onehot=False
+    )
+
+    for i, worker in enumerate(workers):
+        l_n_samples = min(
+            n_samples // n_workers, n_samples - i * (n_samples // n_workers)
+        )
+        future = client.submit(
+            pack,
+            n_samples=l_n_samples,
+            n_features=n_features,
+            n_categories=n_categories,
+            onehot=False,
+            workers=[worker],
+        )
+        dfs.append(future)
+
+    df = dd.from_delayed(dfs, meta=meta)
+    y = df["label"]
+    X = df[df.columns.difference(["label"])]
+
+    if onehot:
+        return dd.get_dummies(X), y
+    return X, y
+
+
 def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
    import cupy as cp
    cp.cuda.runtime.setDevice(0)
@@ -126,6 +171,62 @@ def run_with_dask_array(DMatrixT: Type, client: Client) -> None:
        inplace_predictions)


+@pytest.mark.skipif(**tm.no_dask_cudf())
+def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
+    with Client(local_cuda_cluster) as client:
+        import dask_cudf
+
+        rounds = 10
+        X, y = make_categorical(client, 10000, 30, 13)
+        X = dask_cudf.from_dask_dataframe(X)
+
+        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
+        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
+
+        parameters = {"tree_method": "gpu_hist"}
+
+        m = dxgb.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
+        by_etl_results = dxgb.train(
+            client,
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+        )["history"]
+
+        m = dxgb.DaskDMatrix(client, X, y, enable_categorical=True)
+        output = dxgb.train(
+            client,
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+        )
+        by_builtin_results = output["history"]
+
+        np.testing.assert_allclose(
+            np.array(by_etl_results["Train"]["rmse"]),
+            np.array(by_builtin_results["Train"]["rmse"]),
+            rtol=1e-3,
+        )
+        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
+
+        model = output["booster"]
+        with tempfile.TemporaryDirectory() as tempdir:
+            path = os.path.join(tempdir, "model.json")
+            model.save_model(path)
+            with open(path, "r") as fd:
+                categorical = json.load(fd)
+
+            categories_sizes = np.array(
+                categorical["learner"]["gradient_booster"]["model"]["trees"][-1][
+                    "categories_sizes"
+                ]
+            )
+            assert categories_sizes.shape[0] != 0
+            np.testing.assert_allclose(categories_sizes, 1)
+
+
 def to_cp(x: Any, DMatrixT: Type) -> Any:
    import cupy
    if isinstance(x, np.ndarray) and \
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -236,7 +236,7 @@ def get_mq2008(dpath):

@memory.cache
 def make_categorical(
-    n_samples: int, n_features: int, n_categories: int, onehot_enc: bool
+    n_samples: int, n_features: int, n_categories: int, onehot: bool
 ):
    import pandas as pd

@@ -244,7 +244,7 @@ def make_categorical(

    pd_dict = {}
    for i in range(n_features + 1):
-        c = rng.randint(low=0, high=n_categories + 1, size=n_samples)
+        c = rng.randint(low=0, high=n_categories, size=n_samples)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
@@ -255,11 +255,13 @@ def make_categorical(
    label += 1

    df = df.astype("category")
-    if onehot_enc:
-        cat = pd.get_dummies(df)
-    else:
-        cat = df
-    return cat, label
+    categories = np.arange(0, n_categories)
+    for col in df.columns:
+        df[col] = df[col].cat.set_categories(categories)
+
+    if onehot:
+        return pd.get_dummies(df), label
+    return df, label


 _unweighted_datasets_strategy = strategies.sampled_from(