EX: Make separate example for fork issue.

Skipper Seabold 2015-05-11 09:30:51 -05:00
parent fa8c6e2f0b
commit 15ea00540a
2 changed files with 89 additions and 76 deletions

File 1 of 2 (modified)

@@ -4,86 +4,64 @@ Created on 1 Apr 2015
Removed here: the old `if __name__ == "__main__":` guard, the forkserver
`set_start_method` setup, `import os`, and the trailing "Parallel Parameter
optimization" block; all of that moves to the new standalone example in the
second file below. The file now reads:

@author: Jamie Hall
'''
import pickle
import xgboost as xgb

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston

rng = np.random.RandomState(31337)

print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

print("Iris: multiclass classification")
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

print("Boston Housing: regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))

print("Parameter optimization")
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 4, 6],
                    'n_estimators': [50, 100, 200]}, verbose=1)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

# The sklearn API models are picklable
print("Pickling sklearn API models")
# must open in binary format to pickle
pickle.dump(clf, open("best_boston.pkl", "wb"))
clf2 = pickle.load(open("best_boston.pkl", "rb"))
print(np.allclose(clf.predict(X), clf2.predict(X)))
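
An incidental note on the pickle round trip above: it relies on the garbage
collector to close the two file objects. A minimal equivalent sketch using
context managers (not part of this commit) would be:

    import pickle

    # same round trip, but the file handles are closed deterministically
    with open("best_boston.pkl", "wb") as f:
        pickle.dump(clf, f)
    with open("best_boston.pkl", "rb") as f:
        clf2 = pickle.load(f)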

File 2 of 2 (new file)

@@ -0,0 +1,35 @@
import os

if __name__ == "__main__":
    # NOTE: on posix systems this *has* to be here, inside the
    # `__name__ == "__main__"` clause, to run XGBoost in parallel processes:
    # if XGBoost was built with OpenMP support, joblib's default fork backend
    # can deadlock, so we switch multiprocessing to the forkserver start
    # method before any parallel work begins. If you build XGBoost without
    # OpenMP support, you can keep the default fork backend and omit this.
    try:
        from multiprocessing import set_start_method
    except ImportError:
        raise ImportError("Unable to import multiprocessing.set_start_method."
                          " This example requires Python 3.4 or later.")
    set_start_method("forkserver")

    import numpy as np
    from sklearn.grid_search import GridSearchCV
    from sklearn.datasets import load_boston
    import xgboost as xgb

    rng = np.random.RandomState(31337)

    print("Parallel Parameter optimization")
    boston = load_boston()

    os.environ["OMP_NUM_THREADS"] = "2"  # or to whatever you want
    y = boston['target']
    X = boston['data']
    xgb_model = xgb.XGBRegressor()
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
                       n_jobs=2)
    clf.fit(X, y)
    print(clf.best_score_)
    print(clf.best_params_)
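
The guard above generalizes to any script that mixes an OpenMP-backed library
with process-based parallelism. A minimal sketch of the pattern, with a
hypothetical `run()` standing in for the actual work:

    from multiprocessing import set_start_method

    def run():
        # build the model and call fit(...) with n_jobs > 1 here;
        # heavyweight imports can also live inside this function
        pass

    if __name__ == "__main__":
        # must run in the main module, before any worker processes start
        set_start_method("forkserver")
        run()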