Merge pull request #597 from JohanManders/python-pandas-dtypes

Python pandas dtypes
2015-11-09 18:08:41 -06:00 · 2015-11-09 18:08:41 -06:00 · 1dd96b6cdc
commit 1dd96b6cdc
parent 7491413de5 5f0f8749d9
2 changed files with 40 additions and 24 deletions
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -138,27 +138,42 @@ def c_array(ctype, values):
    return (ctype * len(values))(*values)
-def _maybe_from_pandas(data, feature_names, feature_types):
+def _maybe_from_pandas(data, label, feature_names, feature_types):
    """ Extract internal data from pd.DataFrame """
    try:
        import pandas as pd
    except ImportError:
-        return data, feature_names, feature_types
+        return data, label, feature_names, feature_types
    if not isinstance(data, pd.DataFrame):
-        return data, feature_names, feature_types
+        return data, label, feature_names, feature_types
-    dtypes = data.dtypes
+    mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
-    if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes):
+              'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
-        raise ValueError('DataFrame.dtypes must be int, float or bool')
+              'float16': 'float', 'float32': 'float', 'float64': 'float',
              'bool': 'i'}
    data_dtypes = data.dtypes
    if not all(dtype.name in (mapper.keys()) for dtype in data_dtypes):
        raise ValueError('DataFrame.dtypes for data must be int, float or bool')
    if label is not None:
        if isinstance(label, pd.DataFrame):
            label_dtypes = label.dtypes
            if not all(dtype.name in (mapper.keys()) for dtype in label_dtypes):
                raise ValueError('DataFrame.dtypes for label must be int, float or bool')
            else:
                label = label.values.astype('float')
    if feature_names is None:
        feature_names = data.columns.format()
    if feature_types is None:
-        mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'}
+        feature_types = [mapper[dtype.name] for dtype in data_dtypes]
-        feature_types = [mapper[dtype.name] for dtype in dtypes]
+
    data = data.values.astype('float')
-    return data, feature_names, feature_types
+
    return data, label, feature_names, feature_types
 class DMatrix(object):
    """Data Matrix used in XGBoost.
@@ -192,9 +207,9 @@ class DMatrix(object):
        silent : boolean, optional
            Whether print messages during construction
        feature_names : list, optional
-            Labels for features.
+            Set names for features.
        feature_types : list, optional
-            Labels for features.
+            Set types for features.
        """
        # force into void_p, mac need to pass things in as void_p
        if data is None:
@@ -204,8 +219,10 @@ class DMatrix(object):
        klass = getattr(getattr(data, '__class__', None), '__name__', None)
        if klass == 'DataFrame':
            # once check class name to avoid unnecessary pandas import
-            data, feature_names, feature_types = _maybe_from_pandas(data, feature_names,
+            data, label, feature_names, feature_types = _maybe_from_pandas(data,
-                                                                    feature_types)
+                                                                           label,
                                                                           feature_names,
                                                                           feature_types)
        if isinstance(data, STRING_TYPES):
            self.handle = ctypes.c_void_p()
@@ -520,10 +537,10 @@ class DMatrix(object):
            if len(feature_names) != self.num_col():
                msg = 'feature_names must have the same length as data'
                raise ValueError(msg)
-            # prohibit to use symbols may affect to parse. e.g. ``[]=.``
+            # prohibit to use symbols may affect to parse. e.g. []<
-            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
+            if not all(isinstance(f, STRING_TYPES) and not any(x in f for x in {'[', ']', '<'})
                       for f in feature_names):
-                raise ValueError('all feature_names must be alphanumerics')
+                raise ValueError('feature_names may not contain [, ] or <')
        else:
            # reset feature_types also
            self.feature_types = None
@@ -556,12 +573,11 @@ class DMatrix(object):
            if len(feature_types) != self.num_col():
                msg = 'feature_types must have the same length as data'
                raise ValueError(msg)
            # prohibit to use symbols may affect to parse. e.g. ``[]=.``
-            valid = ('q', 'i', 'int', 'float')
+            valid = ('int', 'float', 'i', 'q')
            if not all(isinstance(f, STRING_TYPES) and f in valid
                       for f in feature_types):
-                raise ValueError('all feature_names must be {i, q, int, float}')
+                raise ValueError('All feature_names must be {int, float, i, q}')
        self._feature_types = feature_types
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -48,7 +48,7 @@ class TestBasic(unittest.TestCase):
                          feature_names=['a', 'b', 'c', 'd', 'd'])
        # contains symbol
        self.assertRaises(ValueError, xgb.DMatrix, data,
-                          feature_names=['a', 'b', 'c', 'd', 'e=1'])
+                          feature_names=['a', 'b', 'c', 'd', 'e<1'])
        dm = xgb.DMatrix(data)
        dm.feature_names = list('abcde')
@@ -105,7 +105,7 @@ class TestBasic(unittest.TestCase):
        df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c'])
        dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
        assert dm.feature_names == ['a', 'b', 'c']
-        assert dm.feature_types == ['int', 'q', 'i']
+        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
@@ -125,14 +125,14 @@ class TestBasic(unittest.TestCase):
        df = pd.DataFrame([[1, 2., True], [2, 3., False]])
        dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
        assert dm.feature_names == ['0', '1', '2']
-        assert dm.feature_types == ['int', 'q', 'i']
+        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
        df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6])
        dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
        assert dm.feature_names == ['4', '5', '6']
-        assert dm.feature_types == ['int', 'q', 'int']
+        assert dm.feature_types == ['int', 'float', 'int']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
@@ -293,4 +293,4 @@ class TestBasic(unittest.TestCase):
        assert isinstance(g, Digraph)
        ax = xgb.plot_tree(classifier, num_trees=0)
-        assert isinstance(ax, Axes)
+        assert isinstance(ax, Axes)