Categorical data support for cuDF. (#7042)

* Add support in DMatrix.
* Add support in DQM, except for iterator.
This commit is contained in:
Jiaming Yuan
2021-06-17 13:54:33 +08:00
committed by GitHub
parent 5c2d7a18c9
commit d9799b09d0
5 changed files with 129 additions and 106 deletions

View File

@@ -171,6 +171,21 @@ Arrow specification.'''
def test_cudf_metainfo_device_dmatrix(self):
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_categorical(self):
import cudf
_X, _y = tm.make_categorical(100, 30, 17, False)
X = cudf.from_pandas(_X)
y = cudf.from_pandas(_y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "categorical" for t in Xy.feature_types)
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "categorical" for t in Xy.feature_types)
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_cupy())

View File

@@ -43,22 +43,8 @@ class TestGPUUpdaters:
assert tm.non_increasing(result['train'][dataset.metric])
def run_categorical_basic(self, rows, cols, rounds, cats):
import pandas as pd
rng = np.random.RandomState(1994)
pd_dict = {}
for i in range(cols):
c = rng.randint(low=0, high=cats+1, size=rows)
pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
df = pd.DataFrame(pd_dict)
label = df.iloc[:, 0]
for i in range(0, cols-1):
label += df.iloc[:, i]
label += 1
df = df.astype('category')
onehot = pd.get_dummies(df)
cat = df
onehot, label = tm.make_categorical(rows, cols, cats, True)
cat, _ = tm.make_categorical(rows, cols, cats, False)
by_etl_results = {}
by_builtin_results = {}

View File

@@ -234,6 +234,34 @@ def get_mq2008(dpath):
x_valid, y_valid, qid_valid)
@memory.cache
def make_categorical(
n_samples: int, n_features: int, n_categories: int, onehot_enc: bool
):
import pandas as pd
rng = np.random.RandomState(1994)
pd_dict = {}
for i in range(n_features + 1):
c = rng.randint(low=0, high=n_categories + 1, size=n_samples)
pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
df = pd.DataFrame(pd_dict)
label = df.iloc[:, 0]
df = df.iloc[:, 1:]
for i in range(0, n_features):
label += df.iloc[:, i]
label += 1
df = df.astype("category")
if onehot_enc:
cat = pd.get_dummies(df)
else:
cat = df
return cat, label
_unweighted_datasets_strategy = strategies.sampled_from(
[TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),