Merge pull request #597 from JohanManders/python-pandas-dtypes
Python pandas dtypes
This commit is contained in:
commit
1dd96b6cdc
@ -138,27 +138,42 @@ def c_array(ctype, values):
|
|||||||
return (ctype * len(values))(*values)
|
return (ctype * len(values))(*values)
|
||||||
|
|
||||||
|
|
||||||
def _maybe_from_pandas(data, feature_names, feature_types):
|
def _maybe_from_pandas(data, label, feature_names, feature_types):
|
||||||
""" Extract internal data from pd.DataFrame """
|
""" Extract internal data from pd.DataFrame """
|
||||||
try:
|
try:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return data, feature_names, feature_types
|
return data, label, feature_names, feature_types
|
||||||
|
|
||||||
if not isinstance(data, pd.DataFrame):
|
if not isinstance(data, pd.DataFrame):
|
||||||
return data, feature_names, feature_types
|
return data, label, feature_names, feature_types
|
||||||
|
|
||||||
dtypes = data.dtypes
|
mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
|
||||||
if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes):
|
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
|
||||||
raise ValueError('DataFrame.dtypes must be int, float or bool')
|
'float16': 'float', 'float32': 'float', 'float64': 'float',
|
||||||
|
'bool': 'i'}
|
||||||
|
|
||||||
|
data_dtypes = data.dtypes
|
||||||
|
if not all(dtype.name in (mapper.keys()) for dtype in data_dtypes):
|
||||||
|
raise ValueError('DataFrame.dtypes for data must be int, float or bool')
|
||||||
|
|
||||||
|
if label is not None:
|
||||||
|
if isinstance(label, pd.DataFrame):
|
||||||
|
label_dtypes = label.dtypes
|
||||||
|
if not all(dtype.name in (mapper.keys()) for dtype in label_dtypes):
|
||||||
|
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
|
||||||
|
else:
|
||||||
|
label = label.values.astype('float')
|
||||||
|
|
||||||
if feature_names is None:
|
if feature_names is None:
|
||||||
feature_names = data.columns.format()
|
feature_names = data.columns.format()
|
||||||
|
|
||||||
if feature_types is None:
|
if feature_types is None:
|
||||||
mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'}
|
feature_types = [mapper[dtype.name] for dtype in data_dtypes]
|
||||||
feature_types = [mapper[dtype.name] for dtype in dtypes]
|
|
||||||
data = data.values.astype('float')
|
data = data.values.astype('float')
|
||||||
return data, feature_names, feature_types
|
|
||||||
|
return data, label, feature_names, feature_types
|
||||||
|
|
||||||
class DMatrix(object):
|
class DMatrix(object):
|
||||||
"""Data Matrix used in XGBoost.
|
"""Data Matrix used in XGBoost.
|
||||||
@ -192,9 +207,9 @@ class DMatrix(object):
|
|||||||
silent : boolean, optional
|
silent : boolean, optional
|
||||||
Whether to print messages during construction
|
Whether to print messages during construction
|
||||||
feature_names : list, optional
|
feature_names : list, optional
|
||||||
Labels for features.
|
Set names for features.
|
||||||
feature_types : list, optional
|
feature_types : list, optional
|
||||||
Labels for features.
|
Set types for features.
|
||||||
"""
|
"""
|
||||||
# force into void_p, mac need to pass things in as void_p
|
# force into void_p, mac need to pass things in as void_p
|
||||||
if data is None:
|
if data is None:
|
||||||
@ -204,8 +219,10 @@ class DMatrix(object):
|
|||||||
klass = getattr(getattr(data, '__class__', None), '__name__', None)
|
klass = getattr(getattr(data, '__class__', None), '__name__', None)
|
||||||
if klass == 'DataFrame':
|
if klass == 'DataFrame':
|
||||||
# check the class name first to avoid an unnecessary pandas import
|
# check the class name first to avoid an unnecessary pandas import
|
||||||
data, feature_names, feature_types = _maybe_from_pandas(data, feature_names,
|
data, label, feature_names, feature_types = _maybe_from_pandas(data,
|
||||||
feature_types)
|
label,
|
||||||
|
feature_names,
|
||||||
|
feature_types)
|
||||||
|
|
||||||
if isinstance(data, STRING_TYPES):
|
if isinstance(data, STRING_TYPES):
|
||||||
self.handle = ctypes.c_void_p()
|
self.handle = ctypes.c_void_p()
|
||||||
@ -520,10 +537,10 @@ class DMatrix(object):
|
|||||||
if len(feature_names) != self.num_col():
|
if len(feature_names) != self.num_col():
|
||||||
msg = 'feature_names must have the same length as data'
|
msg = 'feature_names must have the same length as data'
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
# prohibit symbols that may interfere with parsing, e.g. ``[]=.``
|
# prohibit symbols that may interfere with parsing, e.g. []<
|
||||||
if not all(isinstance(f, STRING_TYPES) and f.isalnum()
|
if not all(isinstance(f, STRING_TYPES) and not any(x in f for x in {'[', ']', '<'})
|
||||||
for f in feature_names):
|
for f in feature_names):
|
||||||
raise ValueError('all feature_names must be alphanumerics')
|
raise ValueError('feature_names may not contain [, ] or <')
|
||||||
else:
|
else:
|
||||||
# reset feature_types also
|
# reset feature_types also
|
||||||
self.feature_types = None
|
self.feature_types = None
|
||||||
@ -556,12 +573,11 @@ class DMatrix(object):
|
|||||||
if len(feature_types) != self.num_col():
|
if len(feature_types) != self.num_col():
|
||||||
msg = 'feature_types must have the same length as data'
|
msg = 'feature_types must have the same length as data'
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
# prohibit symbols that may interfere with parsing, e.g. ``[]=.``
|
|
||||||
|
|
||||||
valid = ('q', 'i', 'int', 'float')
|
valid = ('int', 'float', 'i', 'q')
|
||||||
if not all(isinstance(f, STRING_TYPES) and f in valid
|
if not all(isinstance(f, STRING_TYPES) and f in valid
|
||||||
for f in feature_types):
|
for f in feature_types):
|
||||||
raise ValueError('all feature_names must be {i, q, int, float}')
|
raise ValueError('All feature_names must be {int, float, i, q}')
|
||||||
self._feature_types = feature_types
|
self._feature_types = feature_types
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -48,7 +48,7 @@ class TestBasic(unittest.TestCase):
|
|||||||
feature_names=['a', 'b', 'c', 'd', 'd'])
|
feature_names=['a', 'b', 'c', 'd', 'd'])
|
||||||
# contains symbol
|
# contains symbol
|
||||||
self.assertRaises(ValueError, xgb.DMatrix, data,
|
self.assertRaises(ValueError, xgb.DMatrix, data,
|
||||||
feature_names=['a', 'b', 'c', 'd', 'e=1'])
|
feature_names=['a', 'b', 'c', 'd', 'e<1'])
|
||||||
|
|
||||||
dm = xgb.DMatrix(data)
|
dm = xgb.DMatrix(data)
|
||||||
dm.feature_names = list('abcde')
|
dm.feature_names = list('abcde')
|
||||||
@ -105,7 +105,7 @@ class TestBasic(unittest.TestCase):
|
|||||||
df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c'])
|
df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c'])
|
||||||
dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
|
dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
|
||||||
assert dm.feature_names == ['a', 'b', 'c']
|
assert dm.feature_names == ['a', 'b', 'c']
|
||||||
assert dm.feature_types == ['int', 'q', 'i']
|
assert dm.feature_types == ['int', 'float', 'i']
|
||||||
assert dm.num_row() == 2
|
assert dm.num_row() == 2
|
||||||
assert dm.num_col() == 3
|
assert dm.num_col() == 3
|
||||||
|
|
||||||
@ -125,14 +125,14 @@ class TestBasic(unittest.TestCase):
|
|||||||
df = pd.DataFrame([[1, 2., True], [2, 3., False]])
|
df = pd.DataFrame([[1, 2., True], [2, 3., False]])
|
||||||
dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
|
dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
|
||||||
assert dm.feature_names == ['0', '1', '2']
|
assert dm.feature_names == ['0', '1', '2']
|
||||||
assert dm.feature_types == ['int', 'q', 'i']
|
assert dm.feature_types == ['int', 'float', 'i']
|
||||||
assert dm.num_row() == 2
|
assert dm.num_row() == 2
|
||||||
assert dm.num_col() == 3
|
assert dm.num_col() == 3
|
||||||
|
|
||||||
df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6])
|
df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6])
|
||||||
dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
|
dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
|
||||||
assert dm.feature_names == ['4', '5', '6']
|
assert dm.feature_names == ['4', '5', '6']
|
||||||
assert dm.feature_types == ['int', 'q', 'int']
|
assert dm.feature_types == ['int', 'float', 'int']
|
||||||
assert dm.num_row() == 2
|
assert dm.num_row() == 2
|
||||||
assert dm.num_col() == 3
|
assert dm.num_col() == 3
|
||||||
|
|
||||||
@ -293,4 +293,4 @@ class TestBasic(unittest.TestCase):
|
|||||||
assert isinstance(g, Digraph)
|
assert isinstance(g, Digraph)
|
||||||
|
|
||||||
ax = xgb.plot_tree(classifier, num_trees=0)
|
ax = xgb.plot_tree(classifier, num_trees=0)
|
||||||
assert isinstance(ax, Axes)
|
assert isinstance(ax, Axes)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user