Support more input types for categorical data. (#7220)
* Support more input types for categorical data.
* Shorten the type name from "categorical" to "c".
* Tests for np/cp array and scipy csr/csc/coo.
* Specify the type for feature info.
This commit is contained in:
@@ -285,7 +285,7 @@ void TestCategoricalTreeDump(std::string format, std::string sep) {
|
||||
pos = str.find(cond_str, pos + 1);
|
||||
ASSERT_NE(pos, std::string::npos);
|
||||
|
||||
fmap.PushBack(0, "feat_0", "categorical");
|
||||
fmap.PushBack(0, "feat_0", "c");
|
||||
fmap.PushBack(1, "feat_1", "q");
|
||||
fmap.PushBack(2, "feat_2", "int");
|
||||
|
||||
|
||||
@@ -172,7 +172,7 @@ Arrow specification.'''
|
||||
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_categorical(self):
|
||||
def test_cudf_categorical(self):
|
||||
import cudf
|
||||
_X, _y = tm.make_categorical(100, 30, 17, False)
|
||||
X = cudf.from_pandas(_X)
|
||||
@@ -180,11 +180,11 @@ Arrow specification.'''
|
||||
|
||||
Xy = xgb.DMatrix(X, y, enable_categorical=True)
|
||||
assert len(Xy.feature_types) == X.shape[1]
|
||||
assert all(t == "categorical" for t in Xy.feature_types)
|
||||
assert all(t == "c" for t in Xy.feature_types)
|
||||
|
||||
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
|
||||
assert len(Xy.feature_types) == X.shape[1]
|
||||
assert all(t == "categorical" for t in Xy.feature_types)
|
||||
assert all(t == "c" for t in Xy.feature_types)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
|
||||
@@ -169,6 +169,19 @@ Arrow specification.'''
|
||||
X = cp.random.random((n, 2))
|
||||
xgb.DMatrix(X.toDlpack())
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_categorical(self):
    """Feature types given for a cupy array are reported back by DMatrix."""
    import cupy as cp

    n_features = 10
    expected_types = ['c'] * n_features

    data, labels = tm.make_categorical(
        10, n_features, n_categories=4, onehot=False
    )
    # Convert the generated data to float32 and wrap both inputs as cupy arrays.
    X = cp.asarray(data.values.astype(cp.float32))
    y = cp.array(labels)
    assert isinstance(X, cp.ndarray)

    Xy = xgb.DMatrix(X, y, feature_types=expected_types)
    np.testing.assert_equal(
        np.array(Xy.feature_types), np.array(expected_types)
    )
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_dlpack_device_dmat(self):
|
||||
import cupy as cp
|
||||
|
||||
@@ -339,3 +339,44 @@ class TestDMatrix:
|
||||
Xy = xgb.DMatrix(X, y)
|
||||
assert Xy.num_row() == 10
|
||||
assert Xy.num_col() == 10
|
||||
|
||||
@pytest.mark.skipif(**tm.no_pandas())
def test_np_categorical(self):
    """Feature types given for a numpy array are reported back by DMatrix."""
    n_features = 10
    expected_types = ['c'] * n_features

    data, y = tm.make_categorical(
        10, n_features, n_categories=4, onehot=False
    )
    # Work on the underlying float32 ndarray rather than the generated object.
    X = data.values.astype(np.float32)
    assert isinstance(X, np.ndarray)

    Xy = xgb.DMatrix(X, y, feature_types=expected_types)
    np.testing.assert_equal(
        np.array(Xy.feature_types), np.array(expected_types)
    )
|
||||
|
||||
@pytest.mark.skipif(**tm.no_pandas())
def test_scipy_categorical(self):
    """Feature types given for scipy sparse input are reported back by DMatrix.

    The same data, with two missing entries, is passed as CSR, CSC and COO
    matrices; each resulting DMatrix must expose the requested feature types.
    The pandas skip guard mirrors ``test_np_categorical``, which prepares its
    data the same way through ``tm.make_categorical(...).values``.
    """
    from scipy import sparse

    n_features = 10
    X, y = tm.make_categorical(10, n_features, n_categories=4, onehot=False)
    X = X.values.astype(np.float32)
    feature_types = ['c'] * n_features

    # Use the canonical lowercase spelling: the ``np.NAN`` alias was removed
    # in NumPy 2.0, so the old spelling raises AttributeError there.
    X[1, 3] = np.nan
    X[2, 4] = np.nan

    # Each format is built from the previous one (dense -> csr -> csc -> coo),
    # the same conversion chain the test has always exercised.
    for to_sparse in (sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix):
        X = to_sparse(X)
        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(
            np.array(Xy.feature_types),
            np.array(feature_types),
            err_msg=to_sparse.__name__,
        )
|
||||
|
||||
def test_uri_categorical(self):
    """Feature types passed with a text-file URI are reported back by DMatrix."""
    # Only the sixth entry is marked "c"; every other feature is "q".
    feature_types = ["q"] * 5 + ["c"] + ["q"] * 120

    uri = os.path.join(dpath, 'agaricus.txt.train') + "?indexing_mode=1"
    Xy = xgb.DMatrix(uri, feature_types=feature_types)

    np.testing.assert_equal(
        np.array(Xy.feature_types), np.array(feature_types)
    )
|
||||
|
||||
@@ -128,7 +128,7 @@ class TestPandas:
|
||||
X = pd.DataFrame({'f0': X})
|
||||
y = rng.randn(rows)
|
||||
m = xgb.DMatrix(X, y, enable_categorical=True)
|
||||
assert m.feature_types[0] == 'categorical'
|
||||
assert m.feature_types[0] == 'c'
|
||||
|
||||
def test_pandas_sparse(self):
|
||||
import pandas as pd
|
||||
|
||||
Reference in New Issue
Block a user