diff --git a/.travis.yml b/.travis.yml index 4f09eb083..4ba2ad873 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ env: - TASK=r_test # python package test - TASK=python_test + - TASK=python_lightweight_test # java package test - TASK=java_test diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 96f705a68..e57ff77e9 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -42,7 +42,7 @@ def plot_importance(booster, ax=None, height=0.2, ------- ax : matplotlib Axes """ - + # TODO: move this to compat.py try: import matplotlib.pyplot as plt except ImportError: diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index d7a6e2619..0d5f55f3c 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -3,10 +3,6 @@ import numpy as np import xgboost as xgb import unittest -import matplotlib - -matplotlib.use('Agg') - dpath = 'demo/data/' rng = np.random.RandomState(1994) @@ -102,86 +98,6 @@ class TestBasic(unittest.TestCase): dm = xgb.DMatrix(dummy, feature_names=list('abcde')) self.assertRaises(ValueError, bst.predict, dm) - def test_pandas(self): - import pandas as pd - df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c']) - dm = xgb.DMatrix(df, label=pd.Series([1, 2])) - assert dm.feature_names == ['a', 'b', 'c'] - assert dm.feature_types == ['int', 'float', 'i'] - assert dm.num_row() == 2 - assert dm.num_col() == 3 - - # overwrite feature_names and feature_types - dm = xgb.DMatrix(df, label=pd.Series([1, 2]), - feature_names=['x', 'y', 'z'], feature_types=['q', 'q', 'q']) - assert dm.feature_names == ['x', 'y', 'z'] - assert dm.feature_types == ['q', 'q', 'q'] - assert dm.num_row() == 2 - assert dm.num_col() == 3 - - # incorrect dtypes - df = pd.DataFrame([[1, 2., 'x'], [2, 3., 'y']], columns=['a', 'b', 'c']) - self.assertRaises(ValueError, xgb.DMatrix, df) - - # numeric columns - df = pd.DataFrame([[1, 2., True], [2, 3., False]]) - dm = xgb.DMatrix(df, label=pd.Series([1, 2])) - assert dm.feature_names == ['0', '1', '2'] - assert dm.feature_types == ['int', 'float', 'i'] - assert dm.num_row() == 2 - assert dm.num_col() == 3 - - df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) - dm = xgb.DMatrix(df, label=pd.Series([1, 2])) - assert dm.feature_names == ['4', '5', '6'] - assert dm.feature_types == ['int', 'float', 'int'] - assert dm.num_row() == 2 - assert dm.num_col() == 3 - - df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) - dummies = pd.get_dummies(df) - # B A_X A_Y A_Z - # 0 1 1 0 0 - # 1 2 0 1 0 - # 2 3 0 0 1 - result, _, _ = xgb.core._maybe_pandas_data(dummies, None, None) - exp = np.array([[1., 1., 0., 0.], - [2., 0., 1., 0.], - [3., 0., 0., 1.]]) - np.testing.assert_array_equal(result, exp) - - dm = xgb.DMatrix(dummies) - assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z'] - assert dm.feature_types == ['int', 'float', 'float', 'float'] - assert dm.num_row() == 3 - assert dm.num_col() == 4 - - df = pd.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]}) - dm = xgb.DMatrix(df) - assert dm.feature_names == ['A=1', 'A=2'] - assert dm.feature_types == ['int', 'int'] - assert dm.num_row() == 3 - assert dm.num_col() == 2 - - def test_pandas_label(self): - import pandas as pd - - # label must be a single column - df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) - self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df) - - # label must be supported dtype - df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)}) - self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df) - - df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)}) - result = xgb.core._maybe_pandas_label(df) - np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], dtype=float)) - - dm = xgb.DMatrix(np.random.randn(3, 2), label=df) - assert dm.num_row() == 3 - assert dm.num_col() == 2 - def test_load_file_invalid(self): self.assertRaises(xgb.core.XGBoostError, xgb.Booster, model_file='incorrect_path') @@ -215,168 +131,8 @@ class TestBasic(unittest.TestCase): dm = xgb.DMatrix(dpath + 'agaricus.txt.train') params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} - import pandas as pd - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10) - assert isinstance(cv, pd.DataFrame) - exp = pd.Index([u'test-error-mean', u'test-error-std', - u'train-error-mean', u'train-error-std']) - assert cv.columns.equals(exp) - - # show progress log (result is the same as above) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - verbose_eval=True) - assert isinstance(cv, pd.DataFrame) - exp = pd.Index([u'test-error-mean', u'test-error-std', - u'train-error-mean', u'train-error-std']) - assert cv.columns.equals(exp) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - verbose_eval=True, show_stdv=False) - assert isinstance(cv, pd.DataFrame) - exp = pd.Index([u'test-error-mean', u'test-error-std', - u'train-error-mean', u'train-error-std']) - assert cv.columns.equals(exp) - # return np.ndarray cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False) assert isinstance(cv, np.ndarray) assert cv.shape == (10, 4) - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) - assert 'eval_metric' in params - assert 'auc' in cv.columns[0] - - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) - assert 'eval_metric' in params - assert 'auc' in cv.columns[0] - - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, early_stopping_rounds=1) - assert 'eval_metric' in params - assert 'auc' in cv.columns[0] - assert cv.shape[0] < 10 - - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='auc') - assert 'auc' in cv.columns[0] - - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['auc']) - assert 'auc' in cv.columns[0] - - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='error') - assert 'eval_metric' in params - assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] - - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error']) - assert 'eval_metric' in params - assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] - - params = list(params.items()) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error']) - assert isinstance(params, list) - assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] - - def test_plotting(self): - bst2 = xgb.Booster(model_file='xgb.model') - # plotting - - from matplotlib.axes import Axes - from graphviz import Digraph - - ax = xgb.plot_importance(bst2) - assert isinstance(ax, Axes) - assert ax.get_title() == 'Feature importance' - assert ax.get_xlabel() == 'F score' - assert ax.get_ylabel() == 'Features' - assert len(ax.patches) == 4 - - ax = xgb.plot_importance(bst2, color='r', - title='t', xlabel='x', ylabel='y') - assert isinstance(ax, Axes) - assert ax.get_title() == 't' - assert ax.get_xlabel() == 'x' - assert ax.get_ylabel() == 'y' - assert len(ax.patches) == 4 - for p in ax.patches: - assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red - - ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'], - title=None, xlabel=None, ylabel=None) - assert isinstance(ax, Axes) - assert ax.get_title() == '' - assert ax.get_xlabel() == '' - assert ax.get_ylabel() == '' - assert len(ax.patches) == 4 - assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # red - assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0) # red - assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0) # blue - assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # blue - - g = xgb.to_graphviz(bst2, num_trees=0) - assert isinstance(g, Digraph) - - ax = xgb.plot_tree(bst2, num_trees=0) - assert isinstance(ax, Axes) - - def test_importance_plot_lim(self): - np.random.seed(1) - dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1] * 50) - bst = xgb.train({}, dm) - assert len(bst.get_fscore()) == 71 - ax = xgb.plot_importance(bst) - assert ax.get_xlim() == (0., 11.) - assert ax.get_ylim() == (-1., 71.) - - ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71)) - assert ax.get_xlim() == (0., 5.) - assert ax.get_ylim() == (10., 71.) - - def test_sklearn_api(self): - from sklearn import datasets - from sklearn.cross_validation import train_test_split - - np.random.seed(1) - - iris = datasets.load_iris() - tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120) - - classifier = xgb.XGBClassifier() - classifier.fit(tr_d, tr_l) - - preds = classifier.predict(te_d) - labels = te_l - err = sum([1 for p, l in zip(preds, labels) if p != l]) / len(te_l) - # error must be smaller than 10% - assert err < 0.1 - - def test_sklearn_plotting(self): - from sklearn import datasets - iris = datasets.load_iris() - - classifier = xgb.XGBClassifier() - classifier.fit(iris.data, iris.target) - - import matplotlib - matplotlib.use('Agg') - - from matplotlib.axes import Axes - from graphviz import Digraph - - ax = xgb.plot_importance(classifier) - assert isinstance(ax, Axes) - assert ax.get_title() == 'Feature importance' - assert ax.get_xlabel() == 'F score' - assert ax.get_ylabel() == 'Features' - assert len(ax.patches) == 4 - - g = xgb.to_graphviz(classifier, num_trees=0) - assert isinstance(g, Digraph) - - ax = xgb.plot_tree(classifier, num_trees=0) - assert isinstance(ax, Axes) diff --git a/tests/python/test_models.py b/tests/python/test_basic_models.py similarity index 100% rename from tests/python/test_models.py rename to tests/python/test_basic_models.py diff --git a/tests/python/test_cv.py b/tests/python/test_cv.py deleted file mode 100644 index b4ad8913c..000000000 --- a/tests/python/test_cv.py +++ /dev/null @@ -1,37 +0,0 @@ -import xgboost as xgb -import numpy as np -from sklearn.datasets import load_digits -from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split -from sklearn.metrics import mean_squared_error -import unittest - -rng = np.random.RandomState(1994) - - -class TestCrossValidation(unittest.TestCase): - def test_cv(self): - digits = load_digits(3) - X = digits['data'] - y = digits['target'] - dm = xgb.DMatrix(X, label=y) - - params = { - 'max_depth': 2, - 'eta': 1, - 'silent': 1, - 'objective': - 'multi:softprob', - 'num_class': 3 - } - - seed = 2016 - nfolds = 5 - skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed) - - import pandas as pd - cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed) - cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed) - cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed) - assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0] - assert cv2.iloc[-1,0] == cv3.iloc[-1,0] - diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py index 512fd20d0..6d1895fb1 100644 --- a/tests/python/test_early_stopping.py +++ b/tests/python/test_early_stopping.py @@ -29,9 +29,6 @@ class TestEarlyStopping(unittest.TestCase): eval_set=[(X_test, y_test)]) assert clf3.best_score == 1 - # TODO: parallel test for early stopping - # TODO: comment out for now. Will re-visit later - def evalerror(self, preds, dtrain): labels = dtrain.get_label() return 'rmse', mean_squared_error(labels, preds) diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py new file mode 100644 index 000000000..7f6123bce --- /dev/null +++ b/tests/python/test_plotting.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +import numpy as np +import xgboost as xgb +import unittest + +import matplotlib +from matplotlib.axes import Axes +from graphviz import Digraph + +matplotlib.use('Agg') + +dpath = 'demo/data/' +rng = np.random.RandomState(1994) + +class TestPlotting(unittest.TestCase): + def test_plotting(self): + bst2 = xgb.Booster(model_file='xgb.model') + + ax = xgb.plot_importance(bst2) + assert isinstance(ax, Axes) + assert ax.get_title() == 'Feature importance' + assert ax.get_xlabel() == 'F score' + assert ax.get_ylabel() == 'Features' + assert len(ax.patches) == 4 + + ax = xgb.plot_importance(bst2, color='r', + title='t', xlabel='x', ylabel='y') + assert isinstance(ax, Axes) + assert ax.get_title() == 't' + assert ax.get_xlabel() == 'x' + assert ax.get_ylabel() == 'y' + assert len(ax.patches) == 4 + for p in ax.patches: + assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red + + ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'], + title=None, xlabel=None, ylabel=None) + assert isinstance(ax, Axes) + assert ax.get_title() == '' + assert ax.get_xlabel() == '' + assert ax.get_ylabel() == '' + assert len(ax.patches) == 4 + assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # red + assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0) # red + assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0) # blue + assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # blue + + g = xgb.to_graphviz(bst2, num_trees=0) + assert isinstance(g, Digraph) + + ax = xgb.plot_tree(bst2, num_trees=0) + assert isinstance(ax, Axes) + + def test_importance_plot_lim(self): + np.random.seed(1) + dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1] * 50) + bst = xgb.train({}, dm) + assert len(bst.get_fscore()) == 71 + ax = xgb.plot_importance(bst) + assert ax.get_xlim() == (0., 11.) + assert ax.get_ylim() == (-1., 71.) + + ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71)) + assert ax.get_xlim() == (0., 5.) + assert ax.get_ylim() == (10., 71.) diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py new file mode 100644 index 000000000..f5ceb6fc2 --- /dev/null +++ b/tests/python/test_with_pandas.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- +import numpy as np +import xgboost as xgb +import unittest +import pandas as pd + +dpath = 'demo/data/' +rng = np.random.RandomState(1994) + + +class TestPandas(unittest.TestCase): + def test_pandas(self): + df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c']) + dm = xgb.DMatrix(df, label=pd.Series([1, 2])) + assert dm.feature_names == ['a', 'b', 'c'] + assert dm.feature_types == ['int', 'float', 'i'] + assert dm.num_row() == 2 + assert dm.num_col() == 3 + + # overwrite feature_names and feature_types + dm = xgb.DMatrix(df, label=pd.Series([1, 2]), + feature_names=['x', 'y', 'z'], feature_types=['q', 'q', 'q']) + assert dm.feature_names == ['x', 'y', 'z'] + assert dm.feature_types == ['q', 'q', 'q'] + assert dm.num_row() == 2 + assert dm.num_col() == 3 + + # incorrect dtypes + df = pd.DataFrame([[1, 2., 'x'], [2, 3., 'y']], columns=['a', 'b', 'c']) + self.assertRaises(ValueError, xgb.DMatrix, df) + + # numeric columns + df = pd.DataFrame([[1, 2., True], [2, 3., False]]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2])) + assert dm.feature_names == ['0', '1', '2'] + assert dm.feature_types == ['int', 'float', 'i'] + assert dm.num_row() == 2 + assert dm.num_col() == 3 + + df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2])) + assert dm.feature_names == ['4', '5', '6'] + assert dm.feature_types == ['int', 'float', 'int'] + assert dm.num_row() == 2 + assert dm.num_col() == 3 + + df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + dummies = pd.get_dummies(df) + # B A_X A_Y A_Z + # 0 1 1 0 0 + # 1 2 0 1 0 + # 2 3 0 0 1 + result, _, _ = xgb.core._maybe_pandas_data(dummies, None, None) + exp = np.array([[1., 1., 0., 0.], + [2., 0., 1., 0.], + [3., 0., 0., 1.]]) + np.testing.assert_array_equal(result, exp) + + dm = xgb.DMatrix(dummies) + assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z'] + assert dm.feature_types == ['int', 'float', 'float', 'float'] + assert dm.num_row() == 3 + assert dm.num_col() == 4 + + df = pd.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]}) + dm = xgb.DMatrix(df) + assert dm.feature_names == ['A=1', 'A=2'] + assert dm.feature_types == ['int', 'int'] + assert dm.num_row() == 3 + assert dm.num_col() == 2 + + def test_pandas_label(self): + # label must be a single column + df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df) + + # label must be supported dtype + df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)}) + self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df) + + df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)}) + result = xgb.core._maybe_pandas_label(df) + np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], dtype=float)) + + dm = xgb.DMatrix(np.random.randn(3, 2), label=df) + assert dm.num_row() == 3 + assert dm.num_col() == 2 + + def test_cv_as_pandas(self): + dm = xgb.DMatrix(dpath + 'agaricus.txt.train') + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} + + import pandas as pd + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10) + assert isinstance(cv, pd.DataFrame) + exp = pd.Index([u'test-error-mean', u'test-error-std', + u'train-error-mean', u'train-error-std']) + assert cv.columns.equals(exp) + + # show progress log (result is the same as above) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + verbose_eval=True) + assert isinstance(cv, pd.DataFrame) + exp = pd.Index([u'test-error-mean', u'test-error-std', + u'train-error-mean', u'train-error-std']) + assert cv.columns.equals(exp) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + verbose_eval=True, show_stdv=False) + assert isinstance(cv, pd.DataFrame) + exp = pd.Index([u'test-error-mean', u'test-error-std', + u'train-error-mean', u'train-error-std']) + assert cv.columns.equals(exp) + + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc'} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) + assert 'eval_metric' in params + assert 'auc' in cv.columns[0] + + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) + assert 'eval_metric' in params + assert 'auc' in cv.columns[0] + + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, early_stopping_rounds=1) + assert 'eval_metric' in params + assert 'auc' in cv.columns[0] + assert cv.shape[0] < 10 + + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='auc') + assert 'auc' in cv.columns[0] + + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['auc']) + assert 'auc' in cv.columns[0] + + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='error') + assert 'eval_metric' in params + assert 'auc' not in cv.columns[0] + assert 'error' in cv.columns[0] + + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error']) + assert 'eval_metric' in params + assert 'auc' not in cv.columns[0] + assert 'error' in cv.columns[0] + + params = list(params.items()) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error']) + assert isinstance(params, list) + assert 'auc' not in cv.columns[0] + assert 'error' in cv.columns[0] \ No newline at end of file diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 5cfe40891..161fe30a6 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -4,6 +4,7 @@ from sklearn.cross_validation import KFold from sklearn.metrics import mean_squared_error from sklearn.grid_search import GridSearchCV from sklearn.datasets import load_iris, load_digits, load_boston +from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split rng = np.random.RandomState(1994) @@ -130,3 +131,65 @@ def test_classification_with_custom_objective(): X, y ) +def test_sklearn_api(): + iris = load_iris() + tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120) + + classifier = xgb.XGBClassifier() + classifier.fit(tr_d, tr_l) + + preds = classifier.predict(te_d) + labels = te_l + err = sum([1 for p, l in zip(preds, labels) if p != l]) / len(te_l) + assert err < 0.2 + +def test_sklearn_plotting(): + iris = load_iris() + + classifier = xgb.XGBClassifier() + classifier.fit(iris.data, iris.target) + + import matplotlib + matplotlib.use('Agg') + + from matplotlib.axes import Axes + from graphviz import Digraph + + ax = xgb.plot_importance(classifier) + assert isinstance(ax, Axes) + assert ax.get_title() == 'Feature importance' + assert ax.get_xlabel() == 'F score' + assert ax.get_ylabel() == 'Features' + assert len(ax.patches) == 4 + + g = xgb.to_graphviz(classifier, num_trees=0) + assert isinstance(g, Digraph) + + ax = xgb.plot_tree(classifier, num_trees=0) + assert isinstance(ax, Axes) + +def test_sklearn_nfolds_cv(): + digits = load_digits(3) + X = digits['data'] + y = digits['target'] + dm = xgb.DMatrix(X, label=y) + + params = { + 'max_depth': 2, + 'eta': 1, + 'silent': 1, + 'objective': + 'multi:softprob', + 'num_class': 3 + } + + seed = 2016 + nfolds = 5 + skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed) + + import pandas as pd + cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed) + cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed) + cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed) + assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0] + assert cv2.iloc[-1,0] == cv3.iloc[-1,0] \ No newline at end of file diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index ec41bc8a3..5795d89ff 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -38,6 +38,23 @@ if [ ${TASK} == "python_test" ]; then exit 0 fi +if [ ${TASK} == "python_lightweight_test" ]; then + make all || exit -1 + echo "-------------------------------" + source activate python3 + python --version + conda install numpy scipy nose + python -m pip install graphviz + python -m nose tests/python/test_basic*.py || exit -1 + source activate python2 + echo "-------------------------------" + python --version + conda install numpy scipy nose + python -m pip install graphviz + python -m nose tests/python/test_basic*.py || exit -1 + exit 0 +fi + if [ ${TASK} == "r_test" ]; then set -e export _R_CHECK_TIMINGS_=0 diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 63f30207f..b74cb90fc 100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -10,7 +10,7 @@ if [ ${TASK} == "lint" ]; then fi -if [ ${TASK} == "python_test" ]; then +if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_lightweight_test" ]; then # python2 if [ ${TRAVIS_OS_NAME} == "osx" ]; then wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh