Support categorical data for dask functional interface and DQM. (#7043)

* Support categorical data for dask functional interface and DQM.

* Implement categorical data support for GPU GK-merge.
* Add support for dask functional interface.
* Add support for DQM.

* Get newer cupy.
This commit is contained in:
Jiaming Yuan
2021-06-18 13:06:52 +08:00
committed by GitHub
parent 7dd29ffd47
commit 86715e4cd4
16 changed files with 364 additions and 167 deletions

View File

@@ -225,19 +225,28 @@ class IterForDMatrixTest(xgb.core.DataIter):
ROWS_PER_BATCH = 100 # data is splited by rows
BATCHES = 16
def __init__(self):
def __init__(self, categorical):
'''Generate some random data for demostration.
Actual data can be anything that is currently supported by XGBoost.
'''
import cudf
self.rows = self.ROWS_PER_BATCH
rng = np.random.RandomState(1994)
self._data = [
cudf.DataFrame(
{'a': rng.randn(self.ROWS_PER_BATCH),
'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
self._labels = [rng.randn(self.rows)] * self.BATCHES
if categorical:
self._data = []
self._labels = []
for i in range(self.BATCHES):
X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False)
self._data.append(cudf.from_pandas(X))
self._labels.append(y)
else:
rng = np.random.RandomState(1994)
self._data = [
cudf.DataFrame(
{'a': rng.randn(self.ROWS_PER_BATCH),
'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
self._labels = [rng.randn(self.rows)] * self.BATCHES
self.it = 0 # set iterator to 0
super().__init__()
@@ -272,24 +281,26 @@ class IterForDMatrixTest(xgb.core.DataIter):
@pytest.mark.skipif(**tm.no_cudf())
def test_from_cudf_iter():
@pytest.mark.parametrize("enable_categorical", [True, False])
def test_from_cudf_iter(enable_categorical):
rounds = 100
it = IterForDMatrixTest()
it = IterForDMatrixTest(enable_categorical)
params = {"tree_method": "gpu_hist"}
# Use iterator
m_it = xgb.DeviceQuantileDMatrix(it)
reg_with_it = xgb.train({'tree_method': 'gpu_hist'}, m_it,
num_boost_round=rounds)
predict_with_it = reg_with_it.predict(m_it)
m_it = xgb.DeviceQuantileDMatrix(it, enable_categorical=enable_categorical)
reg_with_it = xgb.train(params, m_it, num_boost_round=rounds)
# Without using iterator
m = xgb.DMatrix(it.as_array(), it.as_array_labels())
X = it.as_array()
y = it.as_array_labels()
m = xgb.DMatrix(X, y, enable_categorical=enable_categorical)
assert m_it.num_col() == m.num_col()
assert m_it.num_row() == m.num_row()
reg = xgb.train({'tree_method': 'gpu_hist'}, m,
num_boost_round=rounds)
predict = reg.predict(m)
reg = xgb.train(params, m, num_boost_round=rounds)
predict = reg.predict(m)
predict_with_it = reg_with_it.predict(m_it)
np.testing.assert_allclose(predict_with_it, predict)

View File

@@ -1,11 +1,13 @@
import sys
import os
from typing import Type, TypeVar, Any, Dict, List
from typing import Type, TypeVar, Any, Dict, List, Tuple
import pytest
import numpy as np
import asyncio
import xgboost
import subprocess
import tempfile
import json
from collections import OrderedDict
from inspect import signature
from hypothesis import given, strategies, settings, note
@@ -41,6 +43,49 @@ except ImportError:
pass
def make_categorical(
client: Client,
n_samples: int,
n_features: int,
n_categories: int,
onehot: bool = False,
) -> Tuple[dd.DataFrame, dd.Series]:
workers = _get_client_workers(client)
n_workers = len(workers)
dfs = []
def pack(**kwargs: Any) -> dd.DataFrame:
X, y = tm.make_categorical(**kwargs)
X["label"] = y
return X
meta = pack(
n_samples=1, n_features=n_features, n_categories=n_categories, onehot=False
)
for i, worker in enumerate(workers):
l_n_samples = min(
n_samples // n_workers, n_samples - i * (n_samples // n_workers)
)
future = client.submit(
pack,
n_samples=l_n_samples,
n_features=n_features,
n_categories=n_categories,
onehot=False,
workers=[worker],
)
dfs.append(future)
df = dd.from_delayed(dfs, meta=meta)
y = df["label"]
X = df[df.columns.difference(["label"])]
if onehot:
return dd.get_dummies(X), y
return X, y
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
import cupy as cp
cp.cuda.runtime.setDevice(0)
@@ -126,6 +171,62 @@ def run_with_dask_array(DMatrixT: Type, client: Client) -> None:
inplace_predictions)
@pytest.mark.skipif(**tm.no_dask_cudf())
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
with Client(local_cuda_cluster) as client:
import dask_cudf
rounds = 10
X, y = make_categorical(client, 10000, 30, 13)
X = dask_cudf.from_dask_dataframe(X)
X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
parameters = {"tree_method": "gpu_hist"}
m = dxgb.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
by_etl_results = dxgb.train(
client,
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
)["history"]
m = dxgb.DaskDMatrix(client, X, y, enable_categorical=True)
output = dxgb.train(
client,
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
)
by_builtin_results = output["history"]
np.testing.assert_allclose(
np.array(by_etl_results["Train"]["rmse"]),
np.array(by_builtin_results["Train"]["rmse"]),
rtol=1e-3,
)
assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
model = output["booster"]
with tempfile.TemporaryDirectory() as tempdir:
path = os.path.join(tempdir, "model.json")
model.save_model(path)
with open(path, "r") as fd:
categorical = json.load(fd)
categories_sizes = np.array(
categorical["learner"]["gradient_booster"]["model"]["trees"][-1][
"categories_sizes"
]
)
assert categories_sizes.shape[0] != 0
np.testing.assert_allclose(categories_sizes, 1)
def to_cp(x: Any, DMatrixT: Type) -> Any:
import cupy
if isinstance(x, np.ndarray) and \