Support more input types for categorical data. (#7220)

* Support more input types for categorical data.

* Shorten the type name from "categorical" to "c".
* Tests for np/cp array and scipy csr/csc/coo.
* Specify the type for feature info.
This commit is contained in:
Jiaming Yuan
2021-09-16 20:39:30 +08:00
committed by GitHub
parent 2942dc68e4
commit 0ed979b096
11 changed files with 229 additions and 61 deletions

View File

@@ -285,7 +285,7 @@ void TestCategoricalTreeDump(std::string format, std::string sep) {
pos = str.find(cond_str, pos + 1);
ASSERT_NE(pos, std::string::npos);
fmap.PushBack(0, "feat_0", "categorical");
fmap.PushBack(0, "feat_0", "c");
fmap.PushBack(1, "feat_1", "q");
fmap.PushBack(2, "feat_2", "int");

View File

@@ -172,7 +172,7 @@ Arrow specification.'''
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_categorical(self):
def test_cudf_categorical(self):
import cudf
_X, _y = tm.make_categorical(100, 30, 17, False)
X = cudf.from_pandas(_X)
@@ -180,11 +180,11 @@ Arrow specification.'''
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "categorical" for t in Xy.feature_types)
assert all(t == "c" for t in Xy.feature_types)
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "categorical" for t in Xy.feature_types)
assert all(t == "c" for t in Xy.feature_types)
@pytest.mark.skipif(**tm.no_cudf())

View File

@@ -169,6 +169,19 @@ Arrow specification.'''
X = cp.random.random((n, 2))
xgb.DMatrix(X.toDlpack())
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_categorical(self):
    """Feature types given for a cupy array are reported back by the DMatrix.

    Every column is declared with the categorical type code ``"c"`` and the
    resulting ``DMatrix.feature_types`` must match that list.
    """
    import cupy as cp

    n_cols = 10
    expected_types = ['c'] * n_cols

    frame, labels = tm.make_categorical(10, n_cols, n_categories=4, onehot=False)
    # Move both the features and the labels onto the device.
    dense = frame.values.astype(cp.float32)
    X = cp.asarray(dense)
    y = cp.array(labels)
    assert isinstance(X, cp.ndarray)

    Xy = xgb.DMatrix(X, y, feature_types=expected_types)
    reported = np.array(Xy.feature_types)
    np.testing.assert_equal(reported, np.array(expected_types))
@pytest.mark.skipif(**tm.no_cupy())
def test_dlpack_device_dmat(self):
import cupy as cp

View File

@@ -339,3 +339,44 @@ class TestDMatrix:
Xy = xgb.DMatrix(X, y)
assert Xy.num_row() == 10
assert Xy.num_col() == 10
@pytest.mark.skipif(**tm.no_pandas())
def test_np_categorical(self):
    """Feature types given for a numpy array are reported back by the DMatrix.

    Every column is declared with the categorical type code ``"c"`` and the
    resulting ``DMatrix.feature_types`` must match that list.
    """
    n_cols = 10
    expected_types = ['c'] * n_cols

    frame, y = tm.make_categorical(10, n_cols, n_categories=4, onehot=False)
    # Drop down to a plain float32 ndarray before handing it to DMatrix.
    X = frame.values.astype(np.float32)
    assert isinstance(X, np.ndarray)

    Xy = xgb.DMatrix(X, y, feature_types=expected_types)
    reported = np.array(Xy.feature_types)
    np.testing.assert_equal(reported, np.array(expected_types))
@pytest.mark.skipif(**tm.no_pandas())
def test_scipy_categorical(self):
    """Feature types given for scipy sparse input are reported back by the DMatrix.

    The same data is wrapped as CSR, CSC and COO in turn; for each layout the
    resulting ``DMatrix.feature_types`` must equal the ``"c"`` list passed in.
    """
    from scipy import sparse

    n_features = 10
    X, y = tm.make_categorical(10, n_features, n_categories=4, onehot=False)
    X = X.values.astype(np.float32)
    feature_types = ['c'] * n_features

    # Plant a couple of NaN entries before the sparse conversion.
    # NOTE(review): presumably meant to exercise missing values - confirm.
    # ``np.nan`` is used because the ``np.NAN`` alias was removed in NumPy 2.0.
    X[1, 3] = np.nan
    X[2, 4] = np.nan

    # Each conversion starts from the previous sparse matrix, as in the
    # original sequence: dense -> CSR -> CSC -> COO.
    for to_sparse in (sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix):
        X = to_sparse(X)
        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(
            np.array(Xy.feature_types), np.array(feature_types)
        )
def test_uri_categorical(self):
    """Feature types given for a file URI are reported back by the DMatrix.

    Column 5 is declared with type code ``"c"``; the 125 others use ``"q"``.
    """
    expected_types = ["q"] * 126
    expected_types[5] = "c"

    uri = os.path.join(dpath, 'agaricus.txt.train') + "?indexing_mode=1"
    Xy = xgb.DMatrix(uri, feature_types=expected_types)

    reported = np.array(Xy.feature_types)
    np.testing.assert_equal(reported, np.array(expected_types))

View File

@@ -128,7 +128,7 @@ class TestPandas:
X = pd.DataFrame({'f0': X})
y = rng.randn(rows)
m = xgb.DMatrix(X, y, enable_categorical=True)
assert m.feature_types[0] == 'categorical'
assert m.feature_types[0] == 'c'
def test_pandas_sparse(self):
import pandas as pd