From db692a30e5c0930d48689a8f88be06f64aa698a9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 17 Sep 2015 22:46:12 +0900 Subject: [PATCH 1/2] Add feature_types --- python-package/xgboost/core.py | 128 +++++++++++++++++++++++------ python-package/xgboost/plotting.py | 27 ++++-- tests/python/test_basic.py | 17 ++++ 3 files changed, 137 insertions(+), 35 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 77a73d95b..1e14fac7b 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -147,9 +147,11 @@ class DMatrix(object): """ feature_names = None # for previous version's pickle + feature_types = None def __init__(self, data, label=None, missing=0.0, - weight=None, silent=False, feature_names=None): + weight=None, silent=False, + feature_names=None, feature_types=None): """ Data matrix used in XGBoost. @@ -169,6 +171,8 @@ class DMatrix(object): Whether print messages during construction feature_names : list, optional Labels for features. + feature_types : list, optional + Labels for features. """ # force into void_p, mac need to pass things in as void_p if data is None: @@ -196,20 +200,8 @@ class DMatrix(object): if weight is not None: self.set_weight(weight) - # validate feature name - if not feature_names is None: - if not isinstance(feature_names, list): - feature_names = list(feature_names) - if len(feature_names) != len(set(feature_names)): - raise ValueError('feature_names must be unique') - if len(feature_names) != self.num_col(): - msg = 'feature_names must have the same length as data' - raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - if not all(isinstance(f, STRING_TYPES) and f.isalnum() - for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') - self.feature_names = feature_names + self.set_feature_names(feature_names) + self.set_feature_types(feature_types) def _init_from_csr(self, csr): """ @@ -389,6 +381,66 @@ class DMatrix(object): c_array(ctypes.c_uint, group), len(group))) + def set_feature_names(self, feature_names): + """Set feature names (column labels). + + Parameters + ---------- + feature_names : list or None + Labels for features. None will reset existing feature names + """ + if not feature_names is None: + # validate feature name + if not isinstance(feature_names, list): + feature_names = list(feature_names) + if len(feature_names) != len(set(feature_names)): + raise ValueError('feature_names must be unique') + if len(feature_names) != self.num_col(): + msg = 'feature_names must have the same length as data' + raise ValueError(msg) + # prohibit to use symbols may affect to parse. e.g. ``[]=.`` + if not all(isinstance(f, STRING_TYPES) and f.isalnum() + for f in feature_names): + raise ValueError('all feature_names must be alphanumerics') + else: + # reset feature_types also + self.set_feature_types(None) + self.feature_names = feature_names + + def set_feature_types(self, feature_types): + """Set feature types (column types). + + This is for displaying the results and unrelated + to the learning process. + + Parameters + ---------- + feature_types : list or None + Labels for features. 
None will reset existing feature names + """ + if not feature_types is None: + + if self.feature_names is None: + msg = 'Unable to set feature types before setting names' + raise ValueError(msg) + + if isinstance(feature_types, STRING_TYPES): + # single string will be applied to all columns + feature_types = [feature_types] * self.num_col() + + if not isinstance(feature_types, list): + feature_types = list(feature_types) + if len(feature_types) != self.num_col(): + msg = 'feature_types must have the same length as data' + raise ValueError(msg) + # prohibit to use symbols may affect to parse. e.g. ``[]=.`` + + valid = ('q', 'i', 'int', 'float') + if not all(isinstance(f, STRING_TYPES) and f in valid + for f in feature_types): + raise ValueError('all feature_names must be {i, q, int, float}') + self.feature_types = feature_types + def get_label(self): """Get the label of the DMatrix. @@ -416,6 +468,24 @@ class DMatrix(object): """ return self.get_float_info('base_margin') + def get_feature_names(self): + """Get feature names (column labels). + + Returns + ------- + feature_names : list or None + """ + return self.feature_names + + def get_feature_types(self): + """Get feature types (column types). + + Returns + ------- + feature_types : list or None + """ + return self.feature_types + def num_row(self): """Get the number of rows in the DMatrix. @@ -487,7 +557,8 @@ class Booster(object): for d in cache: if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__)) - self._validate_feature_names(d) + self._validate_features(d) + dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle = ctypes.c_void_p() _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle))) @@ -572,7 +643,7 @@ class Booster(object): """ if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) - self._validate_feature_names(dtrain) + self._validate_features(dtrain) if fobj is None: _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle)) @@ -598,7 +669,7 @@ class Booster(object): raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) - self._validate_feature_names(dtrain) + self._validate_features(dtrain) _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, c_array(ctypes.c_float, grad), @@ -629,7 +700,7 @@ class Booster(object): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) - self._validate_feature_names(d[0]) + self._validate_features(d[0]) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) @@ -664,7 +735,7 @@ class Booster(object): result: str Evaluation result string. 
""" - self._validate_feature_names(data) + self._validate_features(data) return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): @@ -703,7 +774,7 @@ class Booster(object): if pred_leaf: option_mask |= 0x02 - self._validate_feature_names(data) + self._validate_features(data) length = ctypes.c_ulong() preds = ctypes.POINTER(ctypes.c_float)() @@ -805,9 +876,12 @@ class Booster(object): fname = from_pystr_to_cstr(self.feature_names) - # supports quantitative type only - # {'q': quantitative, 'i': indicator} - ftype = from_pystr_to_cstr(['q'] * flen) + if self.feature_types is None: + # use quantitative as default + # {'q': quantitative, 'i': indicator} + ftype = from_pystr_to_cstr(['q'] * flen) + else: + ftype = from_pystr_to_cstr(self.feature_types) _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle, flen, fname, @@ -849,12 +923,14 @@ class Booster(object): fmap[fid] += 1 return fmap - def _validate_feature_names(self, data): + def _validate_features(self, data): """ - Validate Booster and data's feature_names are identical + Validate Booster and data's feature_names are identical. + Set feature_names and feature_types from DMatrix """ if self.feature_names is None: self.feature_names = data.feature_names + self.feature_types = data.feature_types else: # Booster can't accept data with different feature names if self.feature_names != data.feature_names: diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 9c9b2a97d..50a844a1e 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -92,7 +92,7 @@ def plot_importance(booster, ax=None, height=0.2, _NODEPAT = re.compile(r'(\d+):\[(.+)\]') _LEAFPAT = re.compile(r'(\d+):(leaf=.+)') _EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)') - +_EDGEPAT2 = re.compile(r'yes=(\d+),no=(\d+)') def _parse_node(graph, text): """parse dumped node""" @@ -111,15 +111,24 @@ def _parse_node(graph, text): def _parse_edge(graph, node, text, yes_color='#0000FF', no_color='#FF0000'): """parse dumped edge""" - match = _EDGEPAT.match(text) + try: + match = _EDGEPAT.match(text) + if match is not None: + yes, no, missing = match.groups() + if yes == missing: + graph.edge(node, yes, label='yes, missing', color=yes_color) + graph.edge(node, no, label='no', color=no_color) + else: + graph.edge(node, yes, label='yes', color=yes_color) + graph.edge(node, no, label='no, missing', color=no_color) + return + except ValueError: + pass + match = _EDGEPAT2.match(text) if match is not None: - yes, no, missing = match.groups() - if yes == missing: - graph.edge(node, yes, label='yes, missing', color=yes_color) - graph.edge(node, no, label='no', color=no_color) - else: - graph.edge(node, yes, label='yes', color=yes_color) - graph.edge(node, no, label='no, missing', color=no_color) + yes, no = match.groups() + graph.edge(node, yes, label='yes', color=yes_color) + graph.edge(node, no, label='no', color=no_color) return raise ValueError('Unable to parse edge: {0}'.format(text)) diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index e7c0629ca..7dc905794 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -47,6 +47,23 @@ class TestBasic(unittest.TestCase): self.assertRaises(ValueError, xgb.DMatrix, data, feature_names=['a', 'b', 'c', 'd', 'e=1']) + dm = xgb.DMatrix(data) + dm.set_feature_names(list('abcde')) + assert dm.get_feature_names() == list('abcde') + + dm.set_feature_types('q') + assert 
dm.get_feature_types() == list('qqqqq') + + dm.set_feature_types(list('qiqiq')) + assert dm.get_feature_types() == list('qiqiq') + + self.assertRaises(ValueError, dm.set_feature_types, list('abcde')) + + # reset + dm.set_feature_names(None) + assert dm.get_feature_names() is None + assert dm.get_feature_types() is None + def test_feature_names(self): data = np.random.randn(100, 5) target = np.array([0, 1] * 50) From f6f3473d17d3ae0acfd614994ebe6194cafca1c3 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 28 Sep 2015 22:36:39 +0900 Subject: [PATCH 2/2] Change to properties --- python-package/xgboost/core.py | 168 +++++++++++++++++---------------- tests/python/test_basic.py | 22 +++-- 2 files changed, 98 insertions(+), 92 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 1e14fac7b..aaddc43fb 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -146,8 +146,8 @@ class DMatrix(object): You can construct DMatrix from numpy.arrays """ - feature_names = None # for previous version's pickle - feature_types = None + _feature_names = None # for previous version's pickle + _feature_types = None def __init__(self, data, label=None, missing=0.0, weight=None, silent=False, @@ -200,8 +200,8 @@ class DMatrix(object): if weight is not None: self.set_weight(weight) - self.set_feature_names(feature_names) - self.set_feature_types(feature_types) + self.feature_names = feature_names + self.feature_types = feature_types def _init_from_csr(self, csr): """ @@ -381,66 +381,6 @@ class DMatrix(object): c_array(ctypes.c_uint, group), len(group))) - def set_feature_names(self, feature_names): - """Set feature names (column labels). - - Parameters - ---------- - feature_names : list or None - Labels for features. None will reset existing feature names - """ - if not feature_names is None: - # validate feature name - if not isinstance(feature_names, list): - feature_names = list(feature_names) - if len(feature_names) != len(set(feature_names)): - raise ValueError('feature_names must be unique') - if len(feature_names) != self.num_col(): - msg = 'feature_names must have the same length as data' - raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - if not all(isinstance(f, STRING_TYPES) and f.isalnum() - for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') - else: - # reset feature_types also - self.set_feature_types(None) - self.feature_names = feature_names - - def set_feature_types(self, feature_types): - """Set feature types (column types). - - This is for displaying the results and unrelated - to the learning process. - - Parameters - ---------- - feature_types : list or None - Labels for features. None will reset existing feature names - """ - if not feature_types is None: - - if self.feature_names is None: - msg = 'Unable to set feature types before setting names' - raise ValueError(msg) - - if isinstance(feature_types, STRING_TYPES): - # single string will be applied to all columns - feature_types = [feature_types] * self.num_col() - - if not isinstance(feature_types, list): - feature_types = list(feature_types) - if len(feature_types) != self.num_col(): - msg = 'feature_types must have the same length as data' - raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. 
``[]=.`` - - valid = ('q', 'i', 'int', 'float') - if not all(isinstance(f, STRING_TYPES) and f in valid - for f in feature_types): - raise ValueError('all feature_names must be {i, q, int, float}') - self.feature_types = feature_types - def get_label(self): """Get the label of the DMatrix. @@ -468,24 +408,6 @@ class DMatrix(object): """ return self.get_float_info('base_margin') - def get_feature_names(self): - """Get feature names (column labels). - - Returns - ------- - feature_names : list or None - """ - return self.feature_names - - def get_feature_types(self): - """Get feature types (column types). - - Returns - ------- - feature_types : list or None - """ - return self.feature_types - def num_row(self): """Get the number of rows in the DMatrix. @@ -531,6 +453,88 @@ class DMatrix(object): ctypes.byref(res.handle))) return res + @property + def feature_names(self): + """Get feature names (column labels). + + Returns + ------- + feature_names : list or None + """ + return self._feature_names + + @property + def feature_types(self): + """Get feature types (column types). + + Returns + ------- + feature_types : list or None + """ + return self._feature_types + + @feature_names.setter + def feature_names(self, feature_names): + """Set feature names (column labels). + + Parameters + ---------- + feature_names : list or None + Labels for features. None will reset existing feature names + """ + if not feature_names is None: + # validate feature name + if not isinstance(feature_names, list): + feature_names = list(feature_names) + if len(feature_names) != len(set(feature_names)): + raise ValueError('feature_names must be unique') + if len(feature_names) != self.num_col(): + msg = 'feature_names must have the same length as data' + raise ValueError(msg) + # prohibit to use symbols may affect to parse. e.g. ``[]=.`` + if not all(isinstance(f, STRING_TYPES) and f.isalnum() + for f in feature_names): + raise ValueError('all feature_names must be alphanumerics') + else: + # reset feature_types also + self.feature_types = None + self._feature_names = feature_names + + @feature_types.setter + def feature_types(self, feature_types): + """Set feature types (column types). + + This is for displaying the results and unrelated + to the learning process. + + Parameters + ---------- + feature_types : list or None + Labels for features. None will reset existing feature names + """ + if not feature_types is None: + + if self.feature_names is None: + msg = 'Unable to set feature types before setting names' + raise ValueError(msg) + + if isinstance(feature_types, STRING_TYPES): + # single string will be applied to all columns + feature_types = [feature_types] * self.num_col() + + if not isinstance(feature_types, list): + feature_types = list(feature_types) + if len(feature_types) != self.num_col(): + msg = 'feature_types must have the same length as data' + raise ValueError(msg) + # prohibit to use symbols may affect to parse. e.g. ``[]=.`` + + valid = ('q', 'i', 'int', 'float') + if not all(isinstance(f, STRING_TYPES) and f in valid + for f in feature_types): + raise ValueError('all feature_names must be {i, q, int, float}') + self._feature_types = feature_types + class Booster(object): """"A Booster of of XGBoost. 
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 7dc905794..afbc53e1e 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -48,21 +48,23 @@ class TestBasic(unittest.TestCase): feature_names=['a', 'b', 'c', 'd', 'e=1']) dm = xgb.DMatrix(data) - dm.set_feature_names(list('abcde')) - assert dm.get_feature_names() == list('abcde') + dm.feature_names = list('abcde') + assert dm.feature_names == list('abcde') - dm.set_feature_types('q') - assert dm.get_feature_types() == list('qqqqq') + dm.feature_types = 'q' + assert dm.feature_types == list('qqqqq') - dm.set_feature_types(list('qiqiq')) - assert dm.get_feature_types() == list('qiqiq') + dm.feature_types = list('qiqiq') + assert dm.feature_types == list('qiqiq') - self.assertRaises(ValueError, dm.set_feature_types, list('abcde')) + def incorrect_type_set(): + dm.feature_types = list('abcde') + self.assertRaises(ValueError, incorrect_type_set) # reset - dm.set_feature_names(None) - assert dm.get_feature_names() is None - assert dm.get_feature_types() is None + dm.feature_names = None + assert dm.feature_names is None + assert dm.feature_types is None def test_feature_names(self): data = np.random.randn(100, 5)
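
Not part of the patches above: a minimal usage sketch of the feature_names / feature_types interface in its final, property-based form (PATCH 2/2). The DMatrix/Booster calls mirror what the diffs add; the column names, toy data, and training parameters are purely illustrative, and the snippet assumes an xgboost build that includes these two commits.

    import numpy as np
    import xgboost as xgb

    # Toy data: 100 rows, 5 columns, binary labels.
    data = np.random.randn(100, 5)
    label = np.random.randint(2, size=100)

    dm = xgb.DMatrix(data, label=label)

    # Names must be alphanumeric, unique, and match num_col();
    # they have to be set before the types.
    dm.feature_names = ['featA', 'featB', 'featC', 'featD', 'featE']

    # A single string is broadcast to every column ...
    dm.feature_types = 'q'
    assert dm.feature_types == ['q'] * 5

    # ... or pass one entry per column; only 'q', 'i', 'int', 'float' are accepted.
    dm.feature_types = ['q', 'i', 'q', 'q', 'q']

    # The Booster copies names and types from the DMatrix in _validate_features(),
    # so they reach XGBoosterDumpModelWithFeatures when the model is dumped.
    bst = xgb.train({'max_depth': 2, 'objective': 'binary:logistic'}, dm,
                    num_boost_round=2)
    print(bst.get_dump()[0])

    # Resetting the names also clears the types.
    dm.feature_names = None
    assert dm.feature_types is None

The types are display-only (they change how splits are rendered in the dump and in plot_tree), which is why the setter rejects anything outside {'q', 'i', 'int', 'float'} and why clearing feature_names clears feature_types with it.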