From 6506a1c4905807b7dc4de0db3fbd0b28806d8c13 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 12 Sep 2015 11:32:51 +0900 Subject: [PATCH 1/3] ENH: allow python to handle feature names --- python-package/xgboost/core.py | 76 ++++++++++++++++++++++++++++++++-- wrapper/xgboost_wrapper.cpp | 8 ++++ wrapper/xgboost_wrapper.h | 7 ++++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 41943cd61..2a4782eea 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import os +import re import sys import ctypes import platform @@ -131,7 +132,11 @@ class DMatrix(object): which is optimized for both memory efficiency and training speed. You can construct DMatrix from numpy.arrays """ - def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): + + feature_names = None # for previous version's pickle + + def __init__(self, data, label=None, missing=0.0, + weight=None, silent=False, feature_names=None): """ Data matrix used in XGBoost. @@ -149,6 +154,8 @@ class DMatrix(object): Weight for each instance. silent : boolean, optional Whether print messages during construction + feature_names : list, optional + Labels for features. 
""" # force into void_p, mac need to pass things in as void_p if data is None: @@ -176,6 +183,18 @@ class DMatrix(object): if weight is not None: self.set_weight(weight) + # validate feature name + if not isinstance(feature_names, list): + feature_names = list(feature_names) + if len(feature_names) != len(set(feature_names)): + raise ValueError('feature_names must be unique') + if len(feature_names) != self.num_col(): + raise ValueError('feature_names must have the same length as data') + if not all(isinstance(f, STRING_TYPES) and f.isalnum() + for f in feature_names): + raise ValueError('all feature_names must be alphanumerics') + self.feature_names = feature_names + def _init_from_csr(self, csr): """ Initialize data from a CSR matrix. @@ -391,6 +410,18 @@ class DMatrix(object): ctypes.byref(ret))) return ret.value + def num_col(self): + """Get the number of columns in the DMatrix. + + Returns + ------- + number of columns : int + """ + ret = ctypes.c_ulong() + _check_call(_LIB.XGDMatrixNumCol(self.handle, + ctypes.byref(ret))) + return ret.value + def slice(self, rindex): """Slice the DMatrix and return a new DMatrix that only contains `rindex`. @@ -404,7 +435,7 @@ class DMatrix(object): res : DMatrix A new DMatrix containing only selected indices. """ - res = DMatrix(None) + res = DMatrix(None, feature_names=self.feature_names) res.handle = ctypes.c_void_p() _check_call(_LIB.XGDMatrixSliceDMatrix(self.handle, c_array(ctypes.c_int, rindex), @@ -419,6 +450,9 @@ class Booster(object): Booster is the model of xgboost, that contains low level routines for training, prediction and evaluation. """ + + feature_names = None + def __init__(self, params=None, cache=(), model_file=None): # pylint: disable=invalid-name """Initialize the Booster. 
@@ -435,6 +469,7 @@ class Booster(object): for d in cache: if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__)) + self._validate_feature_names(d) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle = ctypes.c_void_p() _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle))) @@ -519,6 +554,8 @@ class Booster(object): """ if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) + self._validate_feature_names(dtrain) + if fobj is None: _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle)) else: @@ -543,6 +580,8 @@ class Booster(object): raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) + self._validate_feature_names(dtrain) + _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, c_array(ctypes.c_float, grad), c_array(ctypes.c_float, hess), @@ -572,6 +611,8 @@ class Booster(object): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) + self._validate_feature_names(d) + dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) msg = ctypes.c_char_p() @@ -605,6 +646,7 @@ class Booster(object): result: str Evaluation result string. 
""" + self._validate_feature_names(data) return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): @@ -642,6 +684,9 @@ class Booster(object): option_mask |= 0x01 if pred_leaf: option_mask |= 0x02 + + self._validate_feature_names(data) + length = ctypes.c_ulong() preds = ctypes.POINTER(ctypes.c_float)() _check_call(_LIB.XGBoosterPredict(self.handle, data.handle, @@ -731,6 +776,7 @@ class Booster(object): """ Returns the dump the model as a list of strings. """ + res = [] length = ctypes.c_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() _check_call(_LIB.XGBoosterDumpModel(self.handle, @@ -738,9 +784,19 @@ class Booster(object): int(with_stats), ctypes.byref(length), ctypes.byref(sarr))) - res = [] for i in range(length.value): res.append(str(sarr[i].decode('ascii'))) + + if self.feature_names is not None: + defaults = ['f{0}'.format(i) for i in + range(len(self.feature_names))] + rep = dict((re.escape(k), v) for k, v in + zip(defaults, self.feature_names)) + pattern = re.compile("|".join(rep)) + def _replace(expr): + """ Replace matched group to corresponding values """ + return pattern.sub(lambda m: rep[re.escape(m.group(0))], expr) + res = [_replace(r) for r in res] return res def get_fscore(self, fmap=''): @@ -765,3 +821,17 @@ class Booster(object): else: fmap[fid] += 1 return fmap + + def _validate_feature_names(self, data): + """ + Validate Booster and data's feature_names are identical + """ + if self.feature_names is None: + self.feature_names = data.feature_names + else: + # Booster can't accept data with different feature names + if self.feature_names != data.feature_names: + msg = 'feature_names mismatch: {0} {1}' + raise ValueError(msg.format(self.feature_names, + data.feature_names)) + diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 6956b567d..a6151a567 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -435,6 +435,7 @@ int 
XGDMatrixGetUIntInfo(const DMatrixHandle handle, *out_dptr = BeginPtr(vec); API_END(); } + int XGDMatrixNumRow(const DMatrixHandle handle, bst_ulong *out) { API_BEGIN(); @@ -442,6 +443,13 @@ int XGDMatrixNumRow(const DMatrixHandle handle, API_END(); } +int XGDMatrixNumCol(const DMatrixHandle handle, + bst_ulong *out) { + API_BEGIN(); + *out = static_cast(static_cast(handle)->info.num_col()); + API_END(); +} + // xgboost implementation int XGBoosterCreate(DMatrixHandle dmats[], bst_ulong len, diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 6d3a619fb..0d688b236 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -184,6 +184,13 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle, */ XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, bst_ulong *out); +/*! + * \brief get number of columns + * \param handle the handle to the DMatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, + bst_ulong *out); // --- start XGBoost class /*! 
* \brief create xgboost learner From 48ac946d9f2b462701e90a198c3d690751a76a6f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 12 Sep 2015 14:36:17 +0900 Subject: [PATCH 2/3] Use ctypes --- python-package/xgboost/core.py | 88 +++++++++++++++++++++------------- tests/python/test_basic.py | 20 ++++++++ wrapper/xgboost_wrapper.cpp | 23 +++++++-- wrapper/xgboost_wrapper.h | 20 ++++++++ 4 files changed, 115 insertions(+), 36 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 2a4782eea..bcb68580e 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1,10 +1,9 @@ # coding: utf-8 -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments, too-many-branches """Core XGBoost Library.""" from __future__ import absolute_import import os -import re import sys import ctypes import platform @@ -24,8 +23,9 @@ class XGBoostError(Exception): if sys.version_info[0] == 3: - # pylint: disable=invalid-name + # pylint: disable=invalid-name, redefined-builtin STRING_TYPES = str, + unicode = str else: # pylint: disable=invalid-name STRING_TYPES = basestring, @@ -184,15 +184,18 @@ class DMatrix(object): self.set_weight(weight) # validate feature name - if not isinstance(feature_names, list): - feature_names = list(feature_names) - if len(feature_names) != len(set(feature_names)): - raise ValueError('feature_names must be unique') - if len(feature_names) != self.num_col(): - raise ValueError('feature_names must have the same length as data') - if not all(isinstance(f, STRING_TYPES) and f.isalnum() - for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') + if not feature_names is None: + if not isinstance(feature_names, list): + feature_names = list(feature_names) + if len(feature_names) != len(set(feature_names)): + raise ValueError('feature_names must be unique') + if len(feature_names) != self.num_col(): + msg = 'feature_names must have the same length as data' + 
raise ValueError(msg) + # prohibit symbols that may affect parsing, e.g. ``[]=.`` + if not all(isinstance(f, STRING_TYPES) and f.isalnum() + for f in feature_names): + raise ValueError('all feature_names must be alphanumerics') self.feature_names = feature_names def _init_from_csr(self, csr): @@ -411,13 +414,13 @@ class DMatrix(object): return ret.value def num_col(self): - """Get the number of columns in the DMatrix. + """Get the number of columns (features) in the DMatrix. Returns ------- number of columns : int """ - ret = ctypes.c_ulong() + ret = ctypes.c_uint() _check_call(_LIB.XGDMatrixNumCol(self.handle, ctypes.byref(ret))) return ret.value @@ -611,7 +614,7 @@ class Booster(object): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) - self._validate_feature_names(d) + self._validate_feature_names(d[0]) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) @@ -776,27 +779,46 @@ class Booster(object): """ Returns the dump the model as a list of strings. 
""" - res = [] + length = ctypes.c_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() - _check_call(_LIB.XGBoosterDumpModel(self.handle, - c_str(fmap), - int(with_stats), - ctypes.byref(length), - ctypes.byref(sarr))) - for i in range(length.value): - res.append(str(sarr[i].decode('ascii'))) + if self.feature_names is not None and fmap == '': + flen = int(len(self.feature_names)) + fname = (ctypes.c_char_p * flen)() + ftype = (ctypes.c_char_p * flen)() - if self.feature_names is not None: - defaults = ['f{0}'.format(i) for i in - range(len(self.feature_names))] - rep = dict((re.escape(k), v) for k, v in - zip(defaults, self.feature_names)) - pattern = re.compile("|".join(rep)) - def _replace(expr): - """ Replace matched group to corresponding values """ - return pattern.sub(lambda m: rep[re.escape(m.group(0))], expr) - res = [_replace(r) for r in res] + # supports quantitative type only + # {'q': quantitative, 'i': indicator} + if sys.version_info[0] == 3: + features = [bytes(f, 'utf-8') for f in self.feature_names] + types = [bytes('q', 'utf-8')] * flen + else: + features = [f.encode('utf-8') if isinstance(f, unicode) else f + for f in self.feature_names] + types = ['q'] * flen + + fname[:] = features + ftype[:] = types + _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle, + flen, + fname, + ftype, + int(with_stats), + ctypes.byref(length), + ctypes.byref(sarr))) + else: + _check_call(_LIB.XGBoosterDumpModel(self.handle, + c_str(fmap), + int(with_stats), + ctypes.byref(length), + ctypes.byref(sarr))) + + res = [] + for i in range(length.value): + try: + res.append(str(sarr[i].decode('ascii'))) + except UnicodeDecodeError: + res.append(unicode(sarr[i].decode('utf-8'))) return res def get_fscore(self, fmap=''): diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 93ebaa7fd..70de2626c 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -29,6 +29,26 @@ def test_basic(): # assert they are the same assert 
np.sum(np.abs(preds2-preds)) == 0 +def test_feature_names(): + data = np.random.randn(100, 5) + target = np.array([0, 1] * 50) + + features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] + dm = xgb.DMatrix(data, label=target, + feature_names=features) + assert dm.feature_names == features + assert dm.num_row() == 100 + assert dm.num_col() == 5 + + params={'objective': 'multi:softprob', + 'eval_metric': 'mlogloss', + 'eta': 0.3, + 'num_class': 3} + + bst = xgb.train(params, dm, num_boost_round=10) + scores = bst.get_fscore() + assert list(sorted(k for k in scores)) == features + def test_plotting(): bst2 = xgb.Booster(model_file='xgb.model') # plotting diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index a6151a567..6d547fe18 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -445,9 +445,9 @@ int XGDMatrixNumRow(const DMatrixHandle handle, int XGDMatrixNumCol(const DMatrixHandle handle, bst_ulong *out) { - API_BEGIN(); - *out = static_cast(static_cast(handle)->info.num_col()); - API_END(); + API_BEGIN(); + *out = static_cast(static_cast(handle)->info.num_col()); + API_END(); } // xgboost implementation @@ -580,3 +580,20 @@ int XGBoosterDumpModel(BoosterHandle handle, featmap, with_stats != 0, len); API_END(); } + +int XGBoosterDumpModelWithFeatures(BoosterHandle handle, + int fnum, + const char **fname, + const char **ftype, + int with_stats, + bst_ulong *len, + const char ***out_models) { + API_BEGIN(); + utils::FeatMap featmap; + for (int i = 0; i < fnum; ++i) { + featmap.PushBack(i, fname[i], ftype[i]); + } + *out_models = static_cast(handle)->GetModelDump( + featmap, with_stats != 0, len); + API_END(); +} diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 0d688b236..8d0e78a91 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -331,4 +331,24 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle, int with_stats, bst_ulong *out_len, const char 
***out_dump_array); + +/*! + * \brief dump model, return array of strings representing model dump + * \param handle handle + * \param fnum number of features + * \param fname names of features + * \param ftype types of features + * \param with_stats whether to dump with statistics + * \param len length of output array + * \param out_models pointer to hold representing dump of each model + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterDumpModelWithFeatures(BoosterHandle handle, + int fnum, + const char **fname, + const char **ftype, + int with_stats, + bst_ulong *len, + const char ***out_models); + #endif // XGBOOST_WRAPPER_H_ From 6063d243eb4efac77751a910362a2f143db3e141 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 15 Sep 2015 01:52:41 +0900 Subject: [PATCH 3/3] Mac build fix --- scripts/travis_osx_install.sh | 12 -------- scripts/travis_script.sh | 56 ++++++++++++++++++++++------------- tests/python/test_basic.py | 31 +++++++++++-------- 3 files changed, 53 insertions(+), 46 deletions(-) diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh index adc620a52..8c449c843 100755 --- a/scripts/travis_osx_install.sh +++ b/scripts/travis_osx_install.sh @@ -5,15 +5,3 @@ if [ ${TRAVIS_OS_NAME} != "osx" ]; then fi brew update - -if [ ${TASK} == "python-package" ]; then - brew install python git graphviz - easy_install pip - pip install numpy scipy matplotlib nose -fi - -if [ ${TASK} == "python-package3" ]; then - brew install python3 git graphviz - sudo pip3 install --upgrade setuptools - pip3 install numpy scipy matplotlib nose graphviz -fi diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index c5708b0c8..f633f9d7b 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -33,30 +33,44 @@ if [ ${TASK} == "R-package" ]; then scripts/travis_R_script.sh || exit -1 fi -if [ ${TASK} == "python-package" ]; then - sudo apt-get install graphviz - sudo apt-get install python-numpy 
python-scipy python-matplotlib python-nose - sudo python -m pip install graphviz - make all CXX=${CXX} || exit -1 - nosetests tests/python || exit -1 -fi +if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then -if [ ${TASK} == "python-package3" ]; then - sudo apt-get install graphviz - # python3-matplotlib is unavailale on Ubuntu 12.04 - sudo apt-get install python3-dev - sudo apt-get install python3-numpy python3-scipy python3-nose python3-setuptools - - make all CXX=${CXX} || exit -1 - - if [ ${TRAVIS_OS_NAME} != "osx" ]; then - sudo easy_install3 pip - sudo easy_install3 -U distribute - sudo pip install graphviz matplotlib - nosetests3 tests/python || exit -1 + if [ ${TRAVIS_OS_NAME} == "osx" ]; then + brew install graphviz + if [ ${TASK} == "python-package3" ]; then + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + else + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh + fi else - nosetests tests/python || exit -1 + sudo apt-get install graphviz + if [ ${TASK} == "python-package3" ]; then + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh + else + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh + fi fi + bash conda.sh -b -p $HOME/miniconda + export PATH="$HOME/miniconda/bin:$PATH" + hash -r + conda config --set always_yes yes --set changeps1 no + conda update -q conda + # Useful for debugging any issues with conda + conda info -a + + if [ ${TASK} == "python-package3" ]; then + conda create -n myenv python=3.4 + else + conda create -n myenv python=2.7 + fi + source activate myenv + conda install numpy scipy matplotlib nose + python -m pip install graphviz + + make all CXX=${CXX} || exit -1 + + python -m nose tests/python || exit -1 + python --version fi # only test java under linux for now diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 70de2626c..bb6654f51 100644 
--- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import numpy as np import xgboost as xgb @@ -33,21 +34,25 @@ def test_feature_names(): data = np.random.randn(100, 5) target = np.array([0, 1] * 50) - features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] - dm = xgb.DMatrix(data, label=target, - feature_names=features) - assert dm.feature_names == features - assert dm.num_row() == 100 - assert dm.num_col() == 5 + cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'], + [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']] - params={'objective': 'multi:softprob', - 'eval_metric': 'mlogloss', - 'eta': 0.3, - 'num_class': 3} + for features in cases: + dm = xgb.DMatrix(data, label=target, + feature_names=features) + assert dm.feature_names == features + assert dm.num_row() == 100 + assert dm.num_col() == 5 + + params={'objective': 'multi:softprob', + 'eval_metric': 'mlogloss', + 'eta': 0.3, + 'num_class': 3} + + bst = xgb.train(params, dm, num_boost_round=10) + scores = bst.get_fscore() + assert list(sorted(k for k in scores)) == features - bst = xgb.train(params, dm, num_boost_round=10) - scores = bst.get_fscore() - assert list(sorted(k for k in scores)) == features def test_plotting(): bst2 = xgb.Booster(model_file='xgb.model')