PY3 = (sys.version_info[0] == 3)

if PY3:
    # pylint: disable=invalid-name, redefined-builtin
    STRING_TYPES = str,
else:
    # pylint: disable=invalid-name
    STRING_TYPES = basestring,


def from_pystr_to_cstr(data):
    """Convert a list of Python str to a ctypes array of C strings.

    Text items are encoded as UTF-8; items that are already ``bytes``
    (which includes the native ``str`` on Python 2) are passed through
    unchanged.

    Parameters
    ----------
    data : list
        list of str

    Returns
    -------
    pointers : ctypes array of ``ctypes.c_char_p``
        One entry per element of ``data``, in the same order.

    Raises
    ------
    NotImplementedError
        If ``data`` is not a list.
    """
    if not isinstance(data, list):
        # Only the list form is needed by the callers so far.
        raise NotImplementedError(
            'from_pystr_to_cstr only supports list input')

    # ``bytes`` is an alias of ``str`` on Python 2, so a single expression
    # covers both interpreters without referring to ``unicode``.
    encoded = [d if isinstance(d, bytes) else d.encode('utf-8')
               for d in data]
    pointers = (ctypes.c_char_p * len(encoded))()
    pointers[:] = encoded
    return pointers


def from_cstr_to_pystr(data, length):
    """Revert a C string array to a list of Python str.

    Each entry is first decoded as ASCII and returned as the native
    ``str``; entries that are not pure ASCII are decoded as UTF-8 instead
    (``str`` on Python 3, ``unicode`` on Python 2).

    Parameters
    ----------
    data : ctypes pointer
        pointer to data
    length : ctypes pointer
        pointer to length of data

    Returns
    -------
    res : list
        The decoded strings, ``length.value`` of them.
    """
    res = []
    for i in range(length.value):
        raw = data[i]
        try:
            res.append(str(raw.decode('ascii')))
        except UnicodeDecodeError:
            # decode() already yields the text type on both Python 2 and
            # Python 3, so no version-specific wrapping is needed here.
            res.append(raw.decode('utf-8'))
    return res
dpath = 'demo/data/'


class TestBasic(unittest.TestCase):
    """Basic end-to-end checks of the xgboost Python package."""

    def test_basic(self):
        """Train on the agaricus data, then round-trip model and data."""
        train_mat = xgb.DMatrix(dpath + 'agaricus.txt.train')
        test_mat = xgb.DMatrix(dpath + 'agaricus.txt.test')
        params = {
            'max_depth': 2,
            'eta': 1,
            'silent': 1,
            'objective': 'binary:logistic',
        }
        # validation sets watched during training
        evals = [(test_mat, 'eval'), (train_mat, 'train')]
        rounds = 2
        booster = xgb.train(params, train_mat, rounds, evals)

        # predict, then count thresholded predictions that miss the label
        preds = booster.predict(test_mat)
        labels = test_mat.get_label()
        misses = sum(1 for i, p in enumerate(preds)
                     if int(p > 0.5) != labels[i])
        err = misses / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        # persist the data matrix and the model ...
        test_mat.save_binary('dtest.buffer')
        booster.save_model('xgb.model')
        # ... reload both ...
        reloaded_booster = xgb.Booster(model_file='xgb.model')
        reloaded_mat = xgb.DMatrix('dtest.buffer')
        reloaded_preds = reloaded_booster.predict(reloaded_mat)
        # ... and require identical predictions
        assert np.sum(np.abs(reloaded_preds - preds)) == 0

    def test_dmatrix_init(self):
        """Invalid feature_names must be rejected with ValueError."""
        data = np.random.randn(5, 5)

        invalid_names = (
            list('abcdef'),                 # different length
            ['a', 'b', 'c', 'd', 'd'],      # contains duplicates
            ['a', 'b', 'c', 'd', 'e=1'],    # contains symbol
        )
        for names in invalid_names:
            self.assertRaises(ValueError, xgb.DMatrix, data,
                              feature_names=names)

    def test_feature_names(self):
        """Feature names (ASCII and non-ASCII) survive training."""
        data = np.random.randn(100, 5)
        target = np.array([0, 1] * 50)

        cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
                 [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']]

        # NOTE(review): loop nesting is inferred from data flow (every
        # statement below uses `features` or `bst`) - confirm against the
        # original file.
        for features in cases:
            dm = xgb.DMatrix(data, label=target, feature_names=features)
            assert dm.feature_names == features
            assert dm.num_row() == 100
            assert dm.num_col() == 5

            params = {'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'eta': 0.3,
                      'num_class': 3}

            bst = xgb.train(params, dm, num_boost_round=10)
            scores = bst.get_fscore()
            assert sorted(scores) == features

            # predicting with matching feature names must work
            dummy = np.random.randn(5, 5)
            dm = xgb.DMatrix(dummy, feature_names=features)
            bst.predict(dm)

            # a different set of feature names must raise an error
            dm = xgb.DMatrix(dummy, feature_names=list('abcde'))
            self.assertRaises(ValueError, bst.predict, dm)

    def test_load_file_invalid(self):
        """Loading a model from a bad path raises ValueError."""
        self.assertRaises(ValueError, xgb.Booster,
                          model_file='incorrect_path')

    def test_plotting(self):
        """Check plot_importance, to_graphviz and plot_tree outputs."""
        # Loads the 'xgb.model' file that test_basic saves, so this test
        # presumably relies on test_basic having run first.
        booster = xgb.Booster(model_file='xgb.model')

        # select the backend before the other matplotlib imports
        import matplotlib
        matplotlib.use('Agg')

        from matplotlib.axes import Axes
        from graphviz import Digraph

        red = (1.0, 0, 0, 1.0)
        blue = (0, 0, 1.0, 1.0)

        # default labels
        ax = xgb.plot_importance(booster)
        assert isinstance(ax, Axes)
        assert ax.get_title() == 'Feature importance'
        assert ax.get_xlabel() == 'F score'
        assert ax.get_ylabel() == 'Features'
        assert len(ax.patches) == 4

        # custom labels and one colour for every bar
        ax = xgb.plot_importance(booster, color='r',
                                 title='t', xlabel='x', ylabel='y')
        assert isinstance(ax, Axes)
        assert ax.get_title() == 't'
        assert ax.get_xlabel() == 'x'
        assert ax.get_ylabel() == 'y'
        assert len(ax.patches) == 4
        for patch in ax.patches:
            assert patch.get_facecolor() == red

        # labels disabled and one colour per bar
        ax = xgb.plot_importance(booster, color=['r', 'r', 'b', 'b'],
                                 title=None, xlabel=None, ylabel=None)
        assert isinstance(ax, Axes)
        assert ax.get_title() == ''
        assert ax.get_xlabel() == ''
        assert ax.get_ylabel() == ''
        assert len(ax.patches) == 4
        for patch, rgba in zip(ax.patches, [red, red, blue, blue]):
            assert patch.get_facecolor() == rgba

        graph = xgb.to_graphviz(booster, num_trees=0)
        assert isinstance(graph, Digraph)
        ax = xgb.plot_tree(booster, num_trees=0)
        assert isinstance(ax, Axes)