Support categorical data for dask functional interface and DQM. (#7043)
* Support categorical data for dask functional interface and DQM. * Implement categorical data support for GPU GK-merge. * Add support for dask functional interface. * Add support for DQM. * Get newer cupy.
This commit is contained in:
@@ -19,7 +19,7 @@ ENV PATH=/opt/python/bin:$PATH
|
||||
# Create new Conda environment with cuDF, Dask, and cuPy
|
||||
RUN \
|
||||
conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
|
||||
python=3.7 cudf=21.08* rmm=21.08* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda dask-cudf cupy \
|
||||
python=3.7 cudf=21.08* rmm=21.08* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda dask-cudf cupy=9.1* \
|
||||
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
|
||||
|
||||
ENV GOSU_VERSION 1.10
|
||||
|
||||
@@ -68,7 +68,16 @@ void TestEquivalent(float sparsity) {
|
||||
auto const& buffer_from_iter = page_concatenated->gidx_buffer;
|
||||
auto const& buffer_from_data = ellpack.Impl()->gidx_buffer;
|
||||
ASSERT_NE(buffer_from_data.Size(), 0);
|
||||
ASSERT_EQ(buffer_from_data.ConstHostVector(), buffer_from_data.ConstHostVector());
|
||||
|
||||
common::CompressedIterator<uint32_t> data_buf{
|
||||
buffer_from_data.ConstHostPointer(), from_data.NumSymbols()};
|
||||
common::CompressedIterator<uint32_t> data_iter{
|
||||
buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()};
|
||||
CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols());
|
||||
CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride);
|
||||
for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) {
|
||||
CHECK_EQ(data_buf[i], data_iter[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -225,6 +225,9 @@ void TestCategoricalPrediction(std::string name) {
|
||||
row[split_ind] = split_cat;
|
||||
auto m = GetDMatrixFromData(row, 1, kCols);
|
||||
|
||||
std::vector<FeatureType> types(10, FeatureType::kCategorical);
|
||||
m->Info().feature_types.HostVector() = types;
|
||||
|
||||
predictor->InitOutPredictions(m->Info(), &out_predictions.predictions, model);
|
||||
predictor->PredictBatch(m.get(), &out_predictions, model, 0);
|
||||
ASSERT_EQ(out_predictions.predictions.Size(), 1ul);
|
||||
|
||||
@@ -225,19 +225,28 @@ class IterForDMatrixTest(xgb.core.DataIter):
|
||||
ROWS_PER_BATCH = 100 # data is split by rows
|
||||
BATCHES = 16
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, categorical):
|
||||
'''Generate some random data for demonstration.
|
||||
|
||||
Actual data can be anything that is currently supported by XGBoost.
|
||||
'''
|
||||
import cudf
|
||||
self.rows = self.ROWS_PER_BATCH
|
||||
rng = np.random.RandomState(1994)
|
||||
self._data = [
|
||||
cudf.DataFrame(
|
||||
{'a': rng.randn(self.ROWS_PER_BATCH),
|
||||
'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
|
||||
self._labels = [rng.randn(self.rows)] * self.BATCHES
|
||||
|
||||
if categorical:
|
||||
self._data = []
|
||||
self._labels = []
|
||||
for i in range(self.BATCHES):
|
||||
X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False)
|
||||
self._data.append(cudf.from_pandas(X))
|
||||
self._labels.append(y)
|
||||
else:
|
||||
rng = np.random.RandomState(1994)
|
||||
self._data = [
|
||||
cudf.DataFrame(
|
||||
{'a': rng.randn(self.ROWS_PER_BATCH),
|
||||
'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
|
||||
self._labels = [rng.randn(self.rows)] * self.BATCHES
|
||||
|
||||
self.it = 0 # set iterator to 0
|
||||
super().__init__()
|
||||
@@ -272,24 +281,26 @@ class IterForDMatrixTest(xgb.core.DataIter):
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_from_cudf_iter():
|
||||
@pytest.mark.parametrize("enable_categorical", [True, False])
|
||||
def test_from_cudf_iter(enable_categorical):
|
||||
rounds = 100
|
||||
it = IterForDMatrixTest()
|
||||
it = IterForDMatrixTest(enable_categorical)
|
||||
params = {"tree_method": "gpu_hist"}
|
||||
|
||||
# Use iterator
|
||||
m_it = xgb.DeviceQuantileDMatrix(it)
|
||||
reg_with_it = xgb.train({'tree_method': 'gpu_hist'}, m_it,
|
||||
num_boost_round=rounds)
|
||||
predict_with_it = reg_with_it.predict(m_it)
|
||||
m_it = xgb.DeviceQuantileDMatrix(it, enable_categorical=enable_categorical)
|
||||
reg_with_it = xgb.train(params, m_it, num_boost_round=rounds)
|
||||
|
||||
# Without using iterator
|
||||
m = xgb.DMatrix(it.as_array(), it.as_array_labels())
|
||||
X = it.as_array()
|
||||
y = it.as_array_labels()
|
||||
|
||||
m = xgb.DMatrix(X, y, enable_categorical=enable_categorical)
|
||||
|
||||
assert m_it.num_col() == m.num_col()
|
||||
assert m_it.num_row() == m.num_row()
|
||||
|
||||
reg = xgb.train({'tree_method': 'gpu_hist'}, m,
|
||||
num_boost_round=rounds)
|
||||
predict = reg.predict(m)
|
||||
reg = xgb.train(params, m, num_boost_round=rounds)
|
||||
|
||||
predict = reg.predict(m)
|
||||
predict_with_it = reg_with_it.predict(m_it)
|
||||
np.testing.assert_allclose(predict_with_it, predict)
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import sys
|
||||
import os
|
||||
from typing import Type, TypeVar, Any, Dict, List
|
||||
from typing import Type, TypeVar, Any, Dict, List, Tuple
|
||||
import pytest
|
||||
import numpy as np
|
||||
import asyncio
|
||||
import xgboost
|
||||
import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
from inspect import signature
|
||||
from hypothesis import given, strategies, settings, note
|
||||
@@ -41,6 +43,49 @@ except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def make_categorical(
    client: Client,
    n_samples: int,
    n_features: int,
    n_categories: int,
    onehot: bool = False,
) -> Tuple[dd.DataFrame, dd.Series]:
    """Build a distributed categorical dataset with one partition per worker.

    Each worker returned by ``_get_client_workers`` is asked to generate its
    own chunk through ``tm.make_categorical``; the chunks are then stitched
    together with ``dd.from_delayed``.

    Parameters
    ----------
    client :
        Dask client used to submit the per-worker generation tasks.
    n_samples :
        Total number of rows requested across all workers.
    n_features :
        Forwarded to ``tm.make_categorical``.
    n_categories :
        Forwarded to ``tm.make_categorical``.
    onehot :
        When true, the feature frame is passed through ``dd.get_dummies``
        before being returned.  The per-worker generation itself is always
        done with ``onehot=False``.

    Returns
    -------
    A ``(X, y)`` pair where ``y`` is the ``"label"`` column of the assembled
    frame and ``X`` holds every other column.
    """
    workers = _get_client_workers(client)
    n_workers = len(workers)
    dfs = []

    def pack(**kwargs: Any) -> dd.DataFrame:
        # Keep features and label in one frame so they travel together as a
        # single partition; they are split apart again after assembly.
        X, y = tm.make_categorical(**kwargs)
        X["label"] = y
        return X

    # A one-row sample generated locally serves as the schema for from_delayed.
    meta = pack(
        n_samples=1, n_features=n_features, n_categories=n_categories, onehot=False
    )

    # Split the rows evenly and hand whatever is left over to the last worker,
    # so the partitions add up to exactly `n_samples` even when it is not a
    # multiple of the worker count.
    per_worker, remainder = divmod(n_samples, n_workers) if n_workers else (0, 0)
    for i, worker in enumerate(workers):
        l_n_samples = per_worker
        if i == n_workers - 1:
            l_n_samples += remainder
        future = client.submit(
            pack,
            n_samples=l_n_samples,
            n_features=n_features,
            n_categories=n_categories,
            onehot=False,
            workers=[worker],
        )
        dfs.append(future)

    df = dd.from_delayed(dfs, meta=meta)
    y = df["label"]
    X = df[df.columns.difference(["label"])]

    if onehot:
        return dd.get_dummies(X), y
    return X, y
|
||||
|
||||
|
||||
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
|
||||
import cupy as cp
|
||||
cp.cuda.runtime.setDevice(0)
|
||||
@@ -126,6 +171,62 @@ def run_with_dask_array(DMatrixT: Type, client: Client) -> None:
|
||||
inplace_predictions)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_dask_cudf())
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    """Train ``gpu_hist`` through dask on categorical data in two encodings.

    The same generated dataset is trained once from its one-hot encoded form
    and once from its categorical form (both through ``DaskDMatrix`` with
    ``enable_categorical=True``).  The two training RMSE histories must agree
    within ``rtol=1e-3``, the categorical history must be non-increasing, and
    the last tree of the saved JSON model must report non-empty
    ``categories_sizes`` that are all equal to 1.
    """
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        n_rounds = 10

        cat_frame, labels = make_categorical(client, 10000, 30, 13)
        cat_frame = dask_cudf.from_dask_dataframe(cat_frame)

        onehot_frame, _ = make_categorical(client, 10000, 30, 13, True)
        onehot_frame = dask_cudf.from_dask_dataframe(onehot_frame)

        train_params = {"tree_method": "gpu_hist"}

        def fit(features: Any) -> Dict[str, Any]:
            # Build the matrix and evaluate on the training data itself so the
            # returned history carries a "Train" entry.
            dtrain = dxgb.DaskDMatrix(
                client, features, labels, enable_categorical=True
            )
            return dxgb.train(
                client,
                train_params,
                dtrain,
                num_boost_round=n_rounds,
                evals=[(dtrain, "Train")],
            )

        # One-hot (ETL) encoding first, then the built-in categorical support.
        etl_history = fit(onehot_frame)["history"]
        builtin_output = fit(cat_frame)
        builtin_history = builtin_output["history"]

        etl_rmse = np.array(etl_history["Train"]["rmse"])
        builtin_rmse = np.array(builtin_history["Train"]["rmse"])
        np.testing.assert_allclose(etl_rmse, builtin_rmse, rtol=1e-3)
        assert tm.non_increasing(builtin_history["Train"]["rmse"])

        booster = builtin_output["booster"]
        with tempfile.TemporaryDirectory() as workdir:
            model_path = os.path.join(workdir, "model.json")
            booster.save_model(model_path)
            with open(model_path, "r") as handle:
                dumped = json.load(handle)

            last_tree = dumped["learner"]["gradient_booster"]["model"]["trees"][-1]
            sizes = np.array(last_tree["categories_sizes"])
            assert sizes.shape[0] != 0
            np.testing.assert_allclose(sizes, 1)
|
||||
|
||||
|
||||
def to_cp(x: Any, DMatrixT: Type) -> Any:
|
||||
import cupy
|
||||
if isinstance(x, np.ndarray) and \
|
||||
|
||||
@@ -236,7 +236,7 @@ def get_mq2008(dpath):
|
||||
|
||||
@memory.cache
|
||||
def make_categorical(
|
||||
n_samples: int, n_features: int, n_categories: int, onehot_enc: bool
|
||||
n_samples: int, n_features: int, n_categories: int, onehot: bool
|
||||
):
|
||||
import pandas as pd
|
||||
|
||||
@@ -244,7 +244,7 @@ def make_categorical(
|
||||
|
||||
pd_dict = {}
|
||||
for i in range(n_features + 1):
|
||||
c = rng.randint(low=0, high=n_categories + 1, size=n_samples)
|
||||
c = rng.randint(low=0, high=n_categories, size=n_samples)
|
||||
pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
|
||||
|
||||
df = pd.DataFrame(pd_dict)
|
||||
@@ -255,11 +255,13 @@ def make_categorical(
|
||||
label += 1
|
||||
|
||||
df = df.astype("category")
|
||||
if onehot_enc:
|
||||
cat = pd.get_dummies(df)
|
||||
else:
|
||||
cat = df
|
||||
return cat, label
|
||||
categories = np.arange(0, n_categories)
|
||||
for col in df.columns:
|
||||
df[col] = df[col].cat.set_categories(categories)
|
||||
|
||||
if onehot:
|
||||
return pd.get_dummies(df), label
|
||||
return df, label
|
||||
|
||||
|
||||
_unweighted_datasets_strategy = strategies.sampled_from(
|
||||
|
||||
Reference in New Issue
Block a user