diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 0048a1462..2114af375 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -33,8 +33,9 @@ List of Contributors
   - Skipper is the major contributor to the scikit-learn module of xgboost.
 * [Zygmunt Zając](https://github.com/zygmuntz)
   - Zygmunt is the master behind the early stopping feature frequently used by kagglers.
-* [Ajinkya Kale](https://github.com/ajkl)
 * [Yuan Tang](https://github.com/terrytangyuan)
+  - Yuan is the major contributor to unit tests in R and Python.
+* [Ajinkya Kale](https://github.com/ajkl)
 * [Boliang Chen](https://github.com/cblsjtu)
 * [Vadim Khotilovich](https://github.com/khotilov)
 * [Yangqing Men](https://github.com/yanqingmen)
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index 710af8e4c..79288b371 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -5,6 +5,7 @@ import unittest
 
 dpath = 'demo/data/'
+rng = np.random.RandomState(1994)
 
 
 class TestBasic(unittest.TestCase):
 
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
new file mode 100644
index 000000000..6190d6286
--- /dev/null
+++ b/tests/python/test_early_stopping.py
@@ -0,0 +1,19 @@
+import xgboost as xgb
+import numpy as np
+from sklearn.datasets import load_digits
+from sklearn.cross_validation import KFold, train_test_split
+
+rng = np.random.RandomState(1994)
+
+def test_early_stopping_nonparallel():
+    # digits = load_digits(2)
+    # X = digits['data']
+    # y = digits['target']
+    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    # clf = xgb.XGBClassifier()
+    # clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
+    #         eval_set=[(X_test, y_test)])
+    print("This test will be re-visited later. ")
+
+# TODO: parallel test for early stopping
+# TODO: comment out for now. Will re-visit later
\ No newline at end of file
diff --git a/tests/python/test_models.py b/tests/python/test_models.py
index 8c06d9de9..a49dc4887 100644
--- a/tests/python/test_models.py
+++ b/tests/python/test_models.py
@@ -5,6 +5,8 @@ dpath = 'demo/data/'
 dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
 dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
 
+rng = np.random.RandomState(1994)
+
 def test_glm():
     param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 }
     watchlist = [(dtest,'eval'), (dtrain,'train')]
@@ -29,6 +31,8 @@ def test_custom_objective():
     def evalerror(preds, dtrain):
         labels = dtrain.get_label()
         return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+    # test custom_objective in training
     bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
     assert isinstance(bst, xgb.core.Booster)
     preds = bst.predict(dtest)
@@ -36,4 +40,23 @@ def test_custom_objective():
     err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
     assert err < 0.1
 
+    # test custom_objective in cross-validation
+    xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
+           obj = logregobj, feval=evalerror)
+
+def test_fpreproc():
+    param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+    num_round = 2
+    def fpreproc(dtrain, dtest, param):
+        label = dtrain.get_label()
+        ratio = float(np.sum(label == 0)) / np.sum(label==1)
+        param['scale_pos_weight'] = ratio
+        return (dtrain, dtest, param)
+    xgb.cv(param, dtrain, num_round, nfold=5,
+           metrics={'auc'}, seed = 0, fpreproc = fpreproc)
+
+def test_show_stdv():
+    param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+    num_round = 2
+    xgb.cv(param, dtrain, num_round, nfold=5,
+           metrics={'error'}, seed = 0, show_stdv = False)
 
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
new file mode 100644
index 000000000..f32374d56
--- /dev/null
+++ b/tests/python/test_with_sklearn.py
@@ -0,0 +1,57 @@
+import xgboost as xgb
+import numpy as np
+from sklearn.cross_validation import KFold, train_test_split
+from sklearn.metrics import mean_squared_error
+from sklearn.grid_search import GridSearchCV
+from sklearn.datasets import load_iris, load_digits, load_boston
+
+rng = np.random.RandomState(1994)
+
+def test_binary_classification():
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf:
+        xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
+        preds = xgb_model.predict(X[test_index])
+        labels = y[test_index]
+        err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
+        assert err < 0.1
+
+def test_multiclass_classification():
+    iris = load_iris()
+    y = iris['target']
+    X = iris['data']
+    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf:
+        xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
+        preds = xgb_model.predict(X[test_index])
+        labels = y[test_index]
+        err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
+        assert err < 0.4
+
+def test_boston_housing_regression():
+    boston = load_boston()
+    y = boston['target']
+    X = boston['data']
+    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf:
+        xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
+        preds = xgb_model.predict(X[test_index])
+        labels = y[test_index]
+        assert mean_squared_error(preds, labels) < 15
+
+def test_parameter_tuning():
+    boston = load_boston()
+    y = boston['target']
+    X = boston['data']
+    xgb_model = xgb.XGBRegressor()
+    clf = GridSearchCV(xgb_model,
+                       {'max_depth': [2,4,6],
+                        'n_estimators': [50,100,200]}, verbose=1)
+    clf.fit(X,y)
+    assert clf.best_score_ < 0.7
+    assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
+
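
Note on the skipped early-stopping test: the body of test_early_stopping_nonparallel() in the new test_early_stopping.py ships commented out and only prints a placeholder. For reference, below is a minimal sketch of what that test would look like when re-enabled, assembled from the commented-out lines in this patch; it assumes the same pre-0.18 scikit-learn sklearn.cross_validation module and the XGBClassifier fit parameters that the commented code already uses.

import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split  # legacy module, as imported elsewhere in this patch

def test_early_stopping_nonparallel():
    # two-class subset of the digits data, split into train and validation sets
    digits = load_digits(2)
    X = digits['data']
    y = digits['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # stop adding trees once AUC on the eval set has not improved for 10 rounds
    clf = xgb.XGBClassifier()
    clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
            eval_set=[(X_test, y_test)])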