From 541580d1575150b984f814d10a04fef49aa243af Mon Sep 17 00:00:00 2001
From: quansie <quansie@hotmail.com>
Date: Mon, 12 Oct 2015 14:19:25 +0200
Subject: [PATCH 1/6] Update training.py

---
 python-package/xgboost/training.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
index 8ad439678..50d359b15 100644
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -73,12 +73,12 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
                 if evals_result is not None:
                     res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
                     for key in evals_name:
-                        evals_idx =  evals_name.index(key)
+                        evals_idx = evals_name.index(key)
                         res_per_eval = len(res) / len(evals_name)
                         for r in range(res_per_eval):
                             res_item = res[(evals_idx*res_per_eval) + r]
                             res_key = res_item[0]
-                            res_val = res_item[1]                           
+                            res_val = res_item[1]
                             if res_key in evals_result[key]:
                                 evals_result[key][res_key].append(res_val)
                             else:
@@ -130,12 +130,12 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
             if evals_result is not None:
                 res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
                 for key in evals_name:
-                    evals_idx =  evals_name.index(key)
+                    evals_idx = evals_name.index(key)
                     res_per_eval = len(res) / len(evals_name)
                     for r in range(res_per_eval):
                         res_item = res[(evals_idx*res_per_eval) + r]
                         res_key = res_item[0]
-                        res_val = res_item[1]                           
+                        res_val = res_item[1]
                         if res_key in evals_result[key]:
                             evals_result[key][res_key].append(res_val)
                         else:

From 9bbc3901ee6ea56e8ecddcf0ffdfcc1a554ee199 Mon Sep 17 00:00:00 2001
From: Johan Manders <johan@sphereness.com>
Date: Sat, 17 Oct 2015 15:13:42 +0200
Subject: [PATCH 2/6] More Pandas dtypes and more flexible variable naming

- Pandas DataFrame supports more dtypes than 'int64', 'float64' and 'bool'; therefore, a number of additional dtypes are now accepted for the data variable.
- From now on the label variable can be a Pandas DataFrame with the same dtypes as the data variable.
- If label is a Pandas DataFrame, it will be converted to float.
- If no feature_types is set, the data dtypes will be converted to 'int' or 'float'.
- The feature_names may contain every character except [, ] or <
---
 python-package/xgboost/core.py | 69 +++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 22 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 0273b7230..c8620ca48 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -138,27 +138,50 @@ def c_array(ctype, values):
     return (ctype * len(values))(*values)
 
 
-def _maybe_from_pandas(data, feature_names, feature_types):
-    """ Extract internal data from pd.DataFrame """
+def _maybe_from_pandas(data, label, feature_names, feature_types):
+    """ Extract internal data from pd.DataFrame
+
+    If data is Pandas DataFrame, feature_names passed through will be ignored and
+    overwritten by the column names of the Pandas DataFrame.
+    """
     try:
         import pandas as pd
     except ImportError:
-        return data, feature_names, feature_types
+        return data, label, feature_names, feature_types
 
     if not isinstance(data, pd.DataFrame):
-        return data, feature_names, feature_types
+        return data, label, feature_names, feature_types
 
-    dtypes = data.dtypes
-    if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes):
-        raise ValueError('DataFrame.dtypes must be int, float or bool')
+    data_dtypes = data.dtypes
+    if not all(dtype.name in ('int8', 'int16', 'int32', 'int64',
+                              'uint8', 'uint16', 'uint32', 'uint64',
+                              'float16', 'float32', 'float64',
+                              'bool') for dtype in data_dtypes):
+        raise ValueError('DataFrame.dtypes for data must be int, float or bool')
+
+    if label is not None:
+        if isinstance(label, pd.DataFrame):
+            label_dtypes = label.dtypes
+            if not all(dtype.name in ('int8', 'int16', 'int32', 'int64',
+                                      'uint8', 'uint16', 'uint32', 'uint64',
+                                      'float16', 'float32', 'float64',
+                                      'bool') for dtype in label_dtypes):
+                raise ValueError('DataFrame.dtypes for label must be int, float or bool')
+            else:
+                label = label.values.astype('float')
+
+    feature_names = data.columns.format()
 
-    if feature_names is None:
-        feature_names = data.columns.format()
     if feature_types is None:
-        mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'}
-        feature_types = [mapper[dtype.name] for dtype in dtypes]
+        mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
+                  'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
+                  'float16': 'float', 'float32': 'float', 'float64': 'float',
+                  'bool': 'int'}
+        feature_types = [mapper[dtype.name] for dtype in data_dtypes]
+
     data = data.values.astype('float')
-    return data, feature_names, feature_types
+
+    return data, label, feature_names, feature_types
 
 class DMatrix(object):
     """Data Matrix used in XGBoost.
@@ -192,9 +215,10 @@ class DMatrix(object):
         silent : boolean, optional
             Whether print messages during construction
         feature_names : list, optional
-            Labels for features.
+            Set names for features. 
+            When data is a Pandas DataFrame, feature_names will be ignored.
         feature_types : list, optional
-            Labels for features.
+            Set types for features.
         """
         # force into void_p, mac need to pass things in as void_p
         if data is None:
@@ -204,8 +228,10 @@ class DMatrix(object):
         klass = getattr(getattr(data, '__class__', None), '__name__', None)
         if klass == 'DataFrame':
             # once check class name to avoid unnecessary pandas import
-            data, feature_names, feature_types = _maybe_from_pandas(data, feature_names,
-                                                                    feature_types)
+            data, label, feature_names, feature_types = _maybe_from_pandas(data,
+                                                                           label,
+                                                                           feature_names,
+                                                                           feature_types)
 
         if isinstance(data, STRING_TYPES):
             self.handle = ctypes.c_void_p()
@@ -520,10 +546,10 @@ class DMatrix(object):
             if len(feature_names) != self.num_col():
                 msg = 'feature_names must have the same length as data'
                 raise ValueError(msg)
-            # prohibit to use symbols may affect to parse. e.g. ``[]=.``
-            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
+            # prohibit to use symbols may affect to parse. e.g. []<
+            if not all(isinstance(f, STRING_TYPES) and not any(x in f for x in {'[', ']', '<'})
                        for f in feature_names):
-                raise ValueError('all feature_names must be alphanumerics')
+                raise ValueError('feature_names may not contain [, ] or <')
         else:
             # reset feature_types also
             self.feature_types = None
@@ -556,12 +582,11 @@ class DMatrix(object):
             if len(feature_types) != self.num_col():
                 msg = 'feature_types must have the same length as data'
                 raise ValueError(msg)
-            # prohibit to use symbols may affect to parse. e.g. ``[]=.``
 
-            valid = ('q', 'i', 'int', 'float')
+            valid = ('int', 'float')
             if not all(isinstance(f, STRING_TYPES) and f in valid
                        for f in feature_types):
-                raise ValueError('all feature_names must be {i, q, int, float}')
+                raise ValueError('All feature_names must be {int, float}')
         self._feature_types = feature_types
 
 

From 7c79c9ac3a580c779ed80639468fe1f71d5c3e61 Mon Sep 17 00:00:00 2001
From: Johan Manders <johan@sphereness.com>
Date: Mon, 19 Oct 2015 17:36:57 +0200
Subject: [PATCH 3/6] Bool gets mapped to i instead of int

---
 python-package/xgboost/core.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index c8620ca48..77ef9533b 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -176,7 +176,7 @@ def _maybe_from_pandas(data, label, feature_names, feature_types):
         mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
                   'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
                   'float16': 'float', 'float32': 'float', 'float64': 'float',
-                  'bool': 'int'}
+                  'bool': 'i'}
         feature_types = [mapper[dtype.name] for dtype in data_dtypes]
 
     data = data.values.astype('float')
@@ -215,7 +215,7 @@ class DMatrix(object):
         silent : boolean, optional
             Whether print messages during construction
         feature_names : list, optional
-            Set names for features. 
+            Set names for features.
             When data is a Pandas DataFrame, feature_names will be ignored.
         feature_types : list, optional
             Set types for features.
@@ -583,10 +583,10 @@ class DMatrix(object):
                 msg = 'feature_types must have the same length as data'
                 raise ValueError(msg)
 
-            valid = ('int', 'float')
+            valid = ('int', 'float', 'i', 'q')
             if not all(isinstance(f, STRING_TYPES) and f in valid
                        for f in feature_types):
-                raise ValueError('All feature_names must be {int, float}')
+                raise ValueError('All feature_names must be {int, float, i, q}')
         self._feature_types = feature_types
 
 

From f9e1b2b7b7b78a092bc8c8aa40b727f865f0396f Mon Sep 17 00:00:00 2001
From: Johan Manders <johan@sphereness.com>
Date: Tue, 3 Nov 2015 21:26:11 +0100
Subject: [PATCH 4/6] Added back feature names

---
 python-package/xgboost/core.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index d27c34f64..93a73152c 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -139,11 +139,7 @@ def c_array(ctype, values):
 
 
 def _maybe_from_pandas(data, label, feature_names, feature_types):
-    """ Extract internal data from pd.DataFrame
-
-    If data is Pandas DataFrame, feature_names passed through will be ignored and
-    overwritten by the column names of the Pandas DataFrame.
-    """
+    """ Extract internal data from pd.DataFrame """
     try:
         import pandas as pd
     except ImportError:
@@ -170,7 +166,8 @@ def _maybe_from_pandas(data, label, feature_names, feature_types):
             else:
                 label = label.values.astype('float')
 
-    feature_names = data.columns.format()
+    if feature_names is None:
+        feature_names = data.columns.format()
 
     if feature_types is None:
         mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
@@ -216,7 +213,6 @@ class DMatrix(object):
             Whether print messages during construction
         feature_names : list, optional
             Set names for features.
-            When data is a Pandas DataFrame, feature_names will be ignored.
         feature_types : list, optional
             Set types for features.
         """

From b0f38e93529c93e5ce25196cd00e08de295570d7 Mon Sep 17 00:00:00 2001
From: Johan Manders <johan@sphereness.com>
Date: Tue, 3 Nov 2015 21:32:47 +0100
Subject: [PATCH 5/6] Changed 4 tests

Changed the symbol test to expect an error on the < sign instead of the = sign.
Changed 3 other checks so that 'float' is expected instead of 'q'.
---
 tests/python/test_basic.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index a8e0d5238..db112372f 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -48,7 +48,7 @@ class TestBasic(unittest.TestCase):
                           feature_names=['a', 'b', 'c', 'd', 'd'])
         # contains symbol
         self.assertRaises(ValueError, xgb.DMatrix, data,
-                          feature_names=['a', 'b', 'c', 'd', 'e=1'])
+                          feature_names=['a', 'b', 'c', 'd', 'e<1'])
 
         dm = xgb.DMatrix(data)
         dm.feature_names = list('abcde')
@@ -105,7 +105,7 @@ class TestBasic(unittest.TestCase):
         df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c'])
         dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
         assert dm.feature_names == ['a', 'b', 'c']
-        assert dm.feature_types == ['int', 'q', 'i']
+        assert dm.feature_types == ['int', 'float', 'i']
         assert dm.num_row() == 2
         assert dm.num_col() == 3
 
@@ -125,14 +125,14 @@ class TestBasic(unittest.TestCase):
         df = pd.DataFrame([[1, 2., True], [2, 3., False]])
         dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
         assert dm.feature_names == ['0', '1', '2']
-        assert dm.feature_types == ['int', 'q', 'i']
+        assert dm.feature_types == ['int', 'float', 'i']
         assert dm.num_row() == 2
         assert dm.num_col() == 3
 
         df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6])
         dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
         assert dm.feature_names == ['4', '5', '6']
-        assert dm.feature_types == ['int', 'q', 'int']
+        assert dm.feature_types == ['int', 'float', 'int']
         assert dm.num_row() == 2
         assert dm.num_col() == 3
 
@@ -293,4 +293,4 @@ class TestBasic(unittest.TestCase):
         assert isinstance(g, Digraph)
 
         ax = xgb.plot_tree(classifier, num_trees=0)
-        assert isinstance(ax, Axes)
\ No newline at end of file
+        assert isinstance(ax, Axes)

From 5f0f8749d90f585ccf0deb61a7ff8ec28cefa7af Mon Sep 17 00:00:00 2001
From: Johan Manders <johan@sphereness.com>
Date: Wed, 4 Nov 2015 18:05:47 +0100
Subject: [PATCH 6/6] Cleaned up some code

---
 python-package/xgboost/core.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 93a73152c..a91019a8c 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -148,20 +148,19 @@ def _maybe_from_pandas(data, label, feature_names, feature_types):
     if not isinstance(data, pd.DataFrame):
         return data, label, feature_names, feature_types
 
+    mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
+              'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
+              'float16': 'float', 'float32': 'float', 'float64': 'float',
+              'bool': 'i'}
+
     data_dtypes = data.dtypes
-    if not all(dtype.name in ('int8', 'int16', 'int32', 'int64',
-                              'uint8', 'uint16', 'uint32', 'uint64',
-                              'float16', 'float32', 'float64',
-                              'bool') for dtype in data_dtypes):
+    if not all(dtype.name in (mapper.keys()) for dtype in data_dtypes):
         raise ValueError('DataFrame.dtypes for data must be int, float or bool')
 
     if label is not None:
         if isinstance(label, pd.DataFrame):
             label_dtypes = label.dtypes
-            if not all(dtype.name in ('int8', 'int16', 'int32', 'int64',
-                                      'uint8', 'uint16', 'uint32', 'uint64',
-                                      'float16', 'float32', 'float64',
-                                      'bool') for dtype in label_dtypes):
+            if not all(dtype.name in (mapper.keys()) for dtype in label_dtypes):
                 raise ValueError('DataFrame.dtypes for label must be int, float or bool')
             else:
                 label = label.values.astype('float')
@@ -170,10 +169,6 @@ def _maybe_from_pandas(data, label, feature_names, feature_types):
         feature_names = data.columns.format()
 
     if feature_types is None:
-        mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
-                  'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
-                  'float16': 'float', 'float32': 'float', 'float64': 'float',
-                  'bool': 'i'}
         feature_types = [mapper[dtype.name] for dtype in data_dtypes]
 
     data = data.values.astype('float')