From 541580d1575150b984f814d10a04fef49aa243af Mon Sep 17 00:00:00 2001 From: quansie Date: Mon, 12 Oct 2015 14:19:25 +0200 Subject: [PATCH 1/6] Update training.py --- python-package/xgboost/training.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 8ad439678..50d359b15 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -73,12 +73,12 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, if evals_result is not None: res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg) for key in evals_name: - evals_idx = evals_name.index(key) + evals_idx = evals_name.index(key) res_per_eval = len(res) / len(evals_name) for r in range(res_per_eval): res_item = res[(evals_idx*res_per_eval) + r] res_key = res_item[0] - res_val = res_item[1] + res_val = res_item[1] if res_key in evals_result[key]: evals_result[key][res_key].append(res_val) else: @@ -130,12 +130,12 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, if evals_result is not None: res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg) for key in evals_name: - evals_idx = evals_name.index(key) + evals_idx = evals_name.index(key) res_per_eval = len(res) / len(evals_name) for r in range(res_per_eval): res_item = res[(evals_idx*res_per_eval) + r] res_key = res_item[0] - res_val = res_item[1] + res_val = res_item[1] if res_key in evals_result[key]: evals_result[key][res_key].append(res_val) else: From 9bbc3901ee6ea56e8ecddcf0ffdfcc1a554ee199 Mon Sep 17 00:00:00 2001 From: Johan Manders Date: Sat, 17 Oct 2015 15:13:42 +0200 Subject: [PATCH 2/6] More Pandas dtypes and more flexible variable naming - Pandas DataFrame supports more dtypes than 'int64', 'float64' and 'bool', therefore added a bunch of extra dtypes for the data variable. 
- From now on the label variable can be a Pandas DataFrame with the same dtypes as the data variable. - If label is a Pandas DataFrame, it will be converted to float. - If no feature_types is set, the data dtypes will be converted to 'int' or 'float'. - The feature_names may contain any character except [, ] or < --- python-package/xgboost/core.py | 69 +++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 0273b7230..c8620ca48 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -138,27 +138,50 @@ def c_array(ctype, values): return (ctype * len(values))(*values) -def _maybe_from_pandas(data, feature_names, feature_types): - """ Extract internal data from pd.DataFrame """ +def _maybe_from_pandas(data, label, feature_names, feature_types): + """ Extract internal data from pd.DataFrame + + If data is Pandas DataFrame, feature_names passed through will be ignored and + overwritten by the column names of the Pandas DataFrame. 
+ """ try: import pandas as pd except ImportError: - return data, feature_names, feature_types + return data, label, feature_names, feature_types if not isinstance(data, pd.DataFrame): - return data, feature_names, feature_types + return data, label, feature_names, feature_types - dtypes = data.dtypes - if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes): - raise ValueError('DataFrame.dtypes must be int, float or bool') + data_dtypes = data.dtypes + if not all(dtype.name in ('int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'bool') for dtype in data_dtypes): + raise ValueError('DataFrame.dtypes for data must be int, float or bool') + + if label is not None: + if isinstance(label, pd.DataFrame): + label_dtypes = label.dtypes + if not all(dtype.name in ('int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'bool') for dtype in label_dtypes): + raise ValueError('DataFrame.dtypes for label must be int, float or bool') + else: + label = label.values.astype('float') + + feature_names = data.columns.format() - if feature_names is None: - feature_names = data.columns.format() if feature_types is None: - mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'} - feature_types = [mapper[dtype.name] for dtype in dtypes] + mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', + 'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', + 'float16': 'float', 'float32': 'float', 'float64': 'float', + 'bool': 'int'} + feature_types = [mapper[dtype.name] for dtype in data_dtypes] + data = data.values.astype('float') - return data, feature_names, feature_types + + return data, label, feature_names, feature_types class DMatrix(object): """Data Matrix used in XGBoost. 
@@ -192,9 +215,10 @@ class DMatrix(object): silent : boolean, optional Whether print messages during construction feature_names : list, optional - Labels for features. + Set names for features. + When data is a Pandas DataFrame, feature_names will be ignored. feature_types : list, optional - Labels for features. + Set types for features. """ # force into void_p, mac need to pass things in as void_p if data is None: @@ -204,8 +228,10 @@ class DMatrix(object): klass = getattr(getattr(data, '__class__', None), '__name__', None) if klass == 'DataFrame': # once check class name to avoid unnecessary pandas import - data, feature_names, feature_types = _maybe_from_pandas(data, feature_names, - feature_types) + data, label, feature_names, feature_types = _maybe_from_pandas(data, + label, + feature_names, + feature_types) if isinstance(data, STRING_TYPES): self.handle = ctypes.c_void_p() @@ -520,10 +546,10 @@ class DMatrix(object): if len(feature_names) != self.num_col(): msg = 'feature_names must have the same length as data' raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - if not all(isinstance(f, STRING_TYPES) and f.isalnum() + # prohibit to use symbols may affect to parse. e.g. []< + if not all(isinstance(f, STRING_TYPES) and not any(x in f for x in {'[', ']', '<'}) for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') + raise ValueError('feature_names may not contain [, ] or <') else: # reset feature_types also self.feature_types = None @@ -556,12 +582,11 @@ class DMatrix(object): if len(feature_types) != self.num_col(): msg = 'feature_types must have the same length as data' raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. 
``[]=.`` - valid = ('q', 'i', 'int', 'float') + valid = ('int', 'float') if not all(isinstance(f, STRING_TYPES) and f in valid for f in feature_types): - raise ValueError('all feature_names must be {i, q, int, float}') + raise ValueError('All feature_names must be {int, float}') self._feature_types = feature_types From 7c79c9ac3a580c779ed80639468fe1f71d5c3e61 Mon Sep 17 00:00:00 2001 From: Johan Manders Date: Mon, 19 Oct 2015 17:36:57 +0200 Subject: [PATCH 3/6] Bool gets mapped to i instead of int --- python-package/xgboost/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index c8620ca48..77ef9533b 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -176,7 +176,7 @@ def _maybe_from_pandas(data, label, feature_names, feature_types): mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', 'float16': 'float', 'float32': 'float', 'float64': 'float', - 'bool': 'int'} + 'bool': 'i'} feature_types = [mapper[dtype.name] for dtype in data_dtypes] data = data.values.astype('float') @@ -215,7 +215,7 @@ class DMatrix(object): silent : boolean, optional Whether print messages during construction feature_names : list, optional - Set names for features. + Set names for features. When data is a Pandas DataFrame, feature_names will be ignored. feature_types : list, optional Set types for features. 
@@ -583,10 +583,10 @@ class DMatrix(object): msg = 'feature_types must have the same length as data' raise ValueError(msg) - valid = ('int', 'float') + valid = ('int', 'float', 'i', 'q') if not all(isinstance(f, STRING_TYPES) and f in valid for f in feature_types): - raise ValueError('All feature_names must be {int, float}') + raise ValueError('All feature_names must be {int, float, i, q}') self._feature_types = feature_types From f9e1b2b7b7b78a092bc8c8aa40b727f865f0396f Mon Sep 17 00:00:00 2001 From: Johan Manders Date: Tue, 3 Nov 2015 21:26:11 +0100 Subject: [PATCH 4/6] Added back feature names --- python-package/xgboost/core.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index d27c34f64..93a73152c 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -139,11 +139,7 @@ def c_array(ctype, values): def _maybe_from_pandas(data, label, feature_names, feature_types): - """ Extract internal data from pd.DataFrame - - If data is Pandas DataFrame, feature_names passed through will be ignored and - overwritten by the column names of the Pandas DataFrame. - """ + """ Extract internal data from pd.DataFrame """ try: import pandas as pd except ImportError: @@ -170,7 +166,8 @@ def _maybe_from_pandas(data, label, feature_names, feature_types): else: label = label.values.astype('float') - feature_names = data.columns.format() + if feature_names is None: + feature_names = data.columns.format() if feature_types is None: mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', @@ -216,7 +213,6 @@ class DMatrix(object): Whether print messages during construction feature_names : list, optional Set names for features. - When data is a Pandas DataFrame, feature_names will be ignored. feature_types : list, optional Set types for features. 
""" From b0f38e93529c93e5ce25196cd00e08de295570d7 Mon Sep 17 00:00:00 2001 From: Johan Manders Date: Tue, 3 Nov 2015 21:32:47 +0100 Subject: [PATCH 5/6] Changed 4 tests Changed symbol test to give error on < sign, not on = sign Changed 3 other functions, so that float is used instead of q --- tests/python/test_basic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index a8e0d5238..db112372f 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -48,7 +48,7 @@ class TestBasic(unittest.TestCase): feature_names=['a', 'b', 'c', 'd', 'd']) # contains symbol self.assertRaises(ValueError, xgb.DMatrix, data, - feature_names=['a', 'b', 'c', 'd', 'e=1']) + feature_names=['a', 'b', 'c', 'd', 'e<1']) dm = xgb.DMatrix(data) dm.feature_names = list('abcde') @@ -105,7 +105,7 @@ class TestBasic(unittest.TestCase): df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c']) dm = xgb.DMatrix(df, label=pd.Series([1, 2])) assert dm.feature_names == ['a', 'b', 'c'] - assert dm.feature_types == ['int', 'q', 'i'] + assert dm.feature_types == ['int', 'float', 'i'] assert dm.num_row() == 2 assert dm.num_col() == 3 @@ -125,14 +125,14 @@ class TestBasic(unittest.TestCase): df = pd.DataFrame([[1, 2., True], [2, 3., False]]) dm = xgb.DMatrix(df, label=pd.Series([1, 2])) assert dm.feature_names == ['0', '1', '2'] - assert dm.feature_types == ['int', 'q', 'i'] + assert dm.feature_types == ['int', 'float', 'i'] assert dm.num_row() == 2 assert dm.num_col() == 3 df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) dm = xgb.DMatrix(df, label=pd.Series([1, 2])) assert dm.feature_names == ['4', '5', '6'] - assert dm.feature_types == ['int', 'q', 'int'] + assert dm.feature_types == ['int', 'float', 'int'] assert dm.num_row() == 2 assert dm.num_col() == 3 @@ -293,4 +293,4 @@ class TestBasic(unittest.TestCase): assert isinstance(g, Digraph) ax = xgb.plot_tree(classifier, 
num_trees=0) - assert isinstance(ax, Axes) \ No newline at end of file + assert isinstance(ax, Axes) From 5f0f8749d90f585ccf0deb61a7ff8ec28cefa7af Mon Sep 17 00:00:00 2001 From: Johan Manders Date: Wed, 4 Nov 2015 18:05:47 +0100 Subject: [PATCH 6/6] Cleaned up some code --- python-package/xgboost/core.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 93a73152c..a91019a8c 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -148,20 +148,19 @@ def _maybe_from_pandas(data, label, feature_names, feature_types): if not isinstance(data, pd.DataFrame): return data, label, feature_names, feature_types + mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', + 'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', + 'float16': 'float', 'float32': 'float', 'float64': 'float', + 'bool': 'i'} + data_dtypes = data.dtypes - if not all(dtype.name in ('int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', - 'bool') for dtype in data_dtypes): + if not all(dtype.name in (mapper.keys()) for dtype in data_dtypes): raise ValueError('DataFrame.dtypes for data must be int, float or bool') if label is not None: if isinstance(label, pd.DataFrame): label_dtypes = label.dtypes - if not all(dtype.name in ('int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', - 'bool') for dtype in label_dtypes): + if not all(dtype.name in (mapper.keys()) for dtype in label_dtypes): raise ValueError('DataFrame.dtypes for label must be int, float or bool') else: label = label.values.astype('float') @@ -170,10 +169,6 @@ def _maybe_from_pandas(data, label, feature_names, feature_types): feature_names = data.columns.format() if feature_types is None: - mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', - 'uint8': 
'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', - 'float16': 'float', 'float32': 'float', 'float64': 'float', - 'bool': 'i'} feature_types = [mapper[dtype.name] for dtype in data_dtypes] data = data.values.astype('float')