Merge pull request #488 from sinhrks/pyfeaturenames

Support feature names in Python package
2015-09-15 09:56:55 -07:00
parent bda3282f6d 6063d243eb
commit ae43fd7c7a
6 changed files with 214 additions and 43 deletions
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -1,5 +1,5 @@
 # coding: utf-8
-# pylint: disable=too-many-arguments
+# pylint: disable=too-many-arguments, too-many-branches
 """Core XGBoost Library."""
 from __future__ import absolute_import

@@ -23,8 +23,9 @@ class XGBoostError(Exception):


 if sys.version_info[0] == 3:
-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, redefined-builtin
    STRING_TYPES = str,
+    unicode = str
 else:
    # pylint: disable=invalid-name
    STRING_TYPES = basestring,
@@ -131,7 +132,11 @@ class DMatrix(object):
    which is optimized for both memory efficiency and training speed.
    You can construct DMatrix from numpy.arrays
    """
-    def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
+
+    feature_names = None  # for previous version's pickle
+
+    def __init__(self, data, label=None, missing=0.0,
+                 weight=None, silent=False, feature_names=None):
        """
        Data matrix used in XGBoost.

@@ -149,6 +154,8 @@ class DMatrix(object):
            Weight for each instance.
        silent : boolean, optional
            Whether print messages during construction
+        feature_names : list, optional
+            Labels for features.
        """
        # force into void_p, mac need to pass things in as void_p
        if data is None:
@@ -176,6 +183,21 @@ class DMatrix(object):
        if weight is not None:
            self.set_weight(weight)

+        # validate feature name
+        if not feature_names is None:
+            if not isinstance(feature_names, list):
+                feature_names = list(feature_names)
+            if len(feature_names) != len(set(feature_names)):
+                raise ValueError('feature_names must be unique')
+            if len(feature_names) != self.num_col():
+                msg = 'feature_names must have the same length as data'
+                raise ValueError(msg)
+            # prohibit to use symbols may affect to parse. e.g. ``[]=.``
+            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
+                       for f in feature_names):
+                raise ValueError('all feature_names must be alphanumerics')
+        self.feature_names = feature_names
+
    def _init_from_csr(self, csr):
        """
        Initialize data from a CSR matrix.
@@ -391,6 +413,18 @@ class DMatrix(object):
                                         ctypes.byref(ret)))
        return ret.value

+    def num_col(self):
+        """Get the number of columns (features) in the DMatrix.
+
+        Returns
+        -------
+        number of columns : int
+        """
+        ret = ctypes.c_uint()
+        _check_call(_LIB.XGDMatrixNumCol(self.handle,
+                                         ctypes.byref(ret)))
+        return ret.value
+
    def slice(self, rindex):
        """Slice the DMatrix and return a new DMatrix that only contains `rindex`.

@@ -404,7 +438,7 @@ class DMatrix(object):
        res : DMatrix
            A new DMatrix containing only selected indices.
        """
-        res = DMatrix(None)
+        res = DMatrix(None, feature_names=self.feature_names)
        res.handle = ctypes.c_void_p()
        _check_call(_LIB.XGDMatrixSliceDMatrix(self.handle,
                                               c_array(ctypes.c_int, rindex),
@@ -419,6 +453,9 @@ class Booster(object):
    Booster is the model of xgboost, that contains low level routines for
    training, prediction and evaluation.
    """
+
+    feature_names = None
+
    def __init__(self, params=None, cache=(), model_file=None):
        # pylint: disable=invalid-name
        """Initialize the Booster.
@@ -435,6 +472,7 @@ class Booster(object):
        for d in cache:
            if not isinstance(d, DMatrix):
                raise TypeError('invalid cache item: {}'.format(type(d).__name__))
+            self._validate_feature_names(d)
        dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
        self.handle = ctypes.c_void_p()
        _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle)))
@@ -519,6 +557,8 @@ class Booster(object):
        """
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
+        self._validate_feature_names(dtrain)
+
        if fobj is None:
            _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
        else:
@@ -543,6 +583,8 @@ class Booster(object):
            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
+        self._validate_feature_names(dtrain)
+
        _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                               c_array(ctypes.c_float, grad),
                                               c_array(ctypes.c_float, hess),
@@ -572,6 +614,8 @@ class Booster(object):
                    raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__))
                if not isinstance(d[1], STRING_TYPES):
                    raise TypeError('expected string, got {}'.format(type(d[1]).__name__))
+                self._validate_feature_names(d[0])
+
            dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
            evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
            msg = ctypes.c_char_p()
@@ -605,6 +649,7 @@ class Booster(object):
        result: str
            Evaluation result string.
        """
+        self._validate_feature_names(data)
        return self.eval_set([(data, name)], iteration)

    def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
@@ -642,6 +687,9 @@ class Booster(object):
            option_mask |= 0x01
        if pred_leaf:
            option_mask |= 0x02
+
+        self._validate_feature_names(data)
+
        length = ctypes.c_ulong()
        preds = ctypes.POINTER(ctypes.c_float)()
        _check_call(_LIB.XGBoosterPredict(self.handle, data.handle,
@@ -731,16 +779,46 @@ class Booster(object):
        """
        Returns the dump the model as a list of strings.
        """
+
        length = ctypes.c_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
-        _check_call(_LIB.XGBoosterDumpModel(self.handle,
-                                            c_str(fmap),
-                                            int(with_stats),
-                                            ctypes.byref(length),
-                                            ctypes.byref(sarr)))
+        if self.feature_names is not None and fmap == '':
+            flen = int(len(self.feature_names))
+            fname = (ctypes.c_char_p * flen)()
+            ftype = (ctypes.c_char_p * flen)()
+
+            # supports quantitative type only
+            # {'q': quantitative, 'i': indicator}
+            if sys.version_info[0] == 3:
+                features = [bytes(f, 'utf-8') for f in self.feature_names]
+                types = [bytes('q', 'utf-8')] * flen
+            else:
+                features = [f.encode('utf-8') if isinstance(f, unicode) else f
+                            for f in self.feature_names]
+                types = ['q'] * flen
+
+            fname[:] = features
+            ftype[:] = types
+            _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle,
+                                                            flen,
+                                                            fname,
+                                                            ftype,
+                                                            int(with_stats),
+                                                            ctypes.byref(length),
+                                                            ctypes.byref(sarr)))
+        else:
+            _check_call(_LIB.XGBoosterDumpModel(self.handle,
+                                                c_str(fmap),
+                                                int(with_stats),
+                                                ctypes.byref(length),
+                                                ctypes.byref(sarr)))
+
        res = []
        for i in range(length.value):
-            res.append(str(sarr[i].decode('ascii')))
+            try:
+                res.append(str(sarr[i].decode('ascii')))
+            except UnicodeDecodeError:
+                res.append(unicode(sarr[i].decode('utf-8')))
        return res

    def get_fscore(self, fmap=''):
@@ -765,3 +843,17 @@ class Booster(object):
                else:
                    fmap[fid] += 1
        return fmap
+
+    def _validate_feature_names(self, data):
+        """
+        Validate Booster and data's feature_names are identical
+        """
+        if self.feature_names is None:
+            self.feature_names = data.feature_names
+        else:
+            # Booster can't accept data with different feature names
+            if self.feature_names != data.feature_names:
+                msg = 'feature_names mismatch: {0} {1}'
+                raise ValueError(msg.format(self.feature_names,
+                                            data.feature_names))
+