Enable categorical data support on Python DMatrix. (#6166)
* Only pandas is recognized.
This commit is contained in:
@@ -67,7 +67,8 @@ class TestModin(unittest.TestCase):
|
||||
# 0 1 1 0 0
|
||||
# 1 2 0 1 0
|
||||
# 2 3 0 0 1
|
||||
- result, _, _ = xgb.data._transform_pandas_df(dummies)
+ result, _, _ = xgb.data._transform_pandas_df(dummies,
+                                              enable_categorical=False)
|
||||
exp = np.array([[1., 1., 0., 0.],
|
||||
[2., 0., 1., 0.],
|
||||
[3., 0., 0., 1.]])
|
||||
@@ -113,15 +114,15 @@ class TestModin(unittest.TestCase):
|
||||
# label must be a single column
|
||||
df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
|
||||
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
|
||||
-                   None, None, 'label', 'float')
+                   False, None, None, 'label', 'float')
|
||||
|
||||
# label must be supported dtype
|
||||
df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
|
||||
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
|
||||
-                   None, None, 'label', 'float')
+                   False, None, None, 'label', 'float')
|
||||
|
||||
df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
|
||||
- result, _, _ = xgb.data._transform_pandas_df(df, None, None,
+ result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
|
||||
'label', 'float')
|
||||
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
|
||||
dtype=float))
|
||||
|
||||
@@ -67,7 +67,8 @@ class TestPandas(unittest.TestCase):
|
||||
# 0 1 1 0 0
|
||||
# 1 2 0 1 0
|
||||
# 2 3 0 0 1
|
||||
- result, _, _ = xgb.data._transform_pandas_df(dummies)
+ result, _, _ = xgb.data._transform_pandas_df(dummies,
+                                              enable_categorical=False)
|
||||
exp = np.array([[1., 1., 0., 0.],
|
||||
[2., 0., 1., 0.],
|
||||
[3., 0., 0., 1.]])
|
||||
@@ -109,6 +110,16 @@ class TestPandas(unittest.TestCase):
|
||||
assert dm.num_row() == 2
|
||||
assert dm.num_col() == 6
|
||||
|
||||
+ def test_pandas_categorical(self):
+     rng = np.random.RandomState(1994)
+     rows = 100
+     X = rng.randint(3, 7, size=rows)
+     X = pd.Series(X, dtype="category")
+     X = pd.DataFrame({'f0': X})
+     y = rng.randn(rows)
+     m = xgb.DMatrix(X, y, enable_categorical=True)
+     assert m.feature_types[0] == 'categorical'
|
||||
|
||||
def test_pandas_sparse(self):
|
||||
import pandas as pd
|
||||
rows = 100
|
||||
@@ -129,15 +140,15 @@ class TestPandas(unittest.TestCase):
|
||||
# label must be a single column
|
||||
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
|
||||
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
|
||||
-                   None, None, 'label', 'float')
+                   False, None, None, 'label', 'float')
|
||||
|
||||
# label must be supported dtype
|
||||
df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
|
||||
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
|
||||
-                   None, None, 'label', 'float')
+                   False, None, None, 'label', 'float')
|
||||
|
||||
df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
|
||||
- result, _, _ = xgb.data._transform_pandas_df(df, None, None,
+ result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
|
||||
'label', 'float')
|
||||
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
|
||||
dtype=float))
|
||||
|
||||
Reference in New Issue
Block a user